In [2]:
import pandas as pd
import torch
from torch import nn, Tensor, tensor, LongTensor
from torch.utils.data import DataLoader, TensorDataset, Dataset
import torch.nn.functional as F
from pathlib import Path
import spacy
import pickle
import random
import math
import re
import numpy as np
import html
from collections import Counter
from spacy.symbols import ORTH
from concurrent.futures import ProcessPoolExecutor
from ast import literal_eval
import warnings
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


Inspired by FastAI implementation of AWD LSTM (https://github.com/fastai/course-v3/blob/master/nbs/dl2/12a_awd_lstm.ipynb)

In [23]:
def dropout_mask(x, sz, p):
    return x.new(*sz).bernoulli_(1-p).div_(1-p)

class RNNDropout(nn.Module):
    def __init__(self, p=0.5):
        super().__init__()
        self.p=p

    def forward(self, x):
        if not self.training or self.p == 0.: return x
        m = dropout_mask(x.data, (x.size(0), 1, x.size(2)), self.p)
        return x * m

In [24]:

WEIGHT_HH = 'weight_hh_l0'

class WeightDropout(nn.Module):
    def __init__(self, module, weight_p=[0.], layer_names=[WEIGHT_HH]):
        super().__init__()
        self.module,self.weight_p,self.layer_names = module,weight_p,layer_names
        for layer in self.layer_names:
            #Makes a copy of the weights of the selected layers.
            w = getattr(self.module, layer)
            self.register_parameter(f'{layer}_raw', nn.Parameter(w.data))
            self.module._parameters[layer] = F.dropout(w, p=self.weight_p, training=False)

    def _setweights(self):
        for layer in self.layer_names:
            raw_w = getattr(self, f'{layer}_raw')
            self.module._parameters[layer] = F.dropout(raw_w, p=self.weight_p, training=self.training)

    def forward(self, *args):
        self._setweights()
        with warnings.catch_warnings():
            #To avoid the warning that comes because the weights aren't flattened.
            warnings.simplefilter("ignore")
            return self.module.forward(*args)

In [25]:
class EmbeddingDropout(nn.Module):
    "Applies dropout in the embedding layer by zeroing out some elements of the embedding vector."
    def __init__(self, emb, embed_p):
        super().__init__()
        self.emb,self.embed_p = emb,embed_p
        self.pad_idx = self.emb.padding_idx
        if self.pad_idx is None: self.pad_idx = -1

    def forward(self, words, scale=None):
        if self.training and self.embed_p != 0:
            size = (self.emb.weight.size(0),1)
            mask = dropout_mask(self.emb.weight.data, size, self.embed_p)
            masked_embed = self.emb.weight * mask
        else: masked_embed = self.emb.weight
        if scale: masked_embed.mul_(scale)
        return F.embedding(words, masked_embed, self.pad_idx, self.emb.max_norm,
                           self.emb.norm_type, self.emb.scale_grad_by_freq, self.emb.sparse)

In [26]:
class AWD_LSTM(nn.Module):
    initrange=0.1

    def __init__(self, vocab_sz, emb_sz, n_hid, n_layers, pad_token,
                 hidden_p=0.2, input_p=0.6, embed_p=0.1, weight_p=0.5):
        super().__init__()
        self.bs = 1
        self.emb_sz = emb_sz
        self.n_hid = n_hid
        self.n_layers = n_layers
        self.emb = nn.Embedding(vocab_sz, emb_sz, padding_idx=pad_token)
        self.emb_dp = EmbeddingDropout(self.emb, embed_p)
        self.rnns = [nn.LSTM(emb_sz if l == 0 else n_hid, (n_hid if l != n_layers - 1 else emb_sz), 1,
                             batch_first=True) for l in range(n_layers)]
        self.rnns = nn.ModuleList([WeightDropout(rnn, weight_p) for rnn in self.rnns])
        self.emb.weight.data.uniform_(-self.initrange, self.initrange)
        self.input_dp = RNNDropout(input_p)
        self.hidden_dps = nn.ModuleList([RNNDropout(hidden_p) for l in range(n_layers)])

        
    def forward(self, input):
        bs,sl = input.size()
        if bs!=self.bs:
            self.bs=bs
            self.reset()
        raw_output = self.input_dp(self.emb_dp(input))
        new_hidden,raw_outputs,outputs = [],[],[]
        for l, (rnn,hid_dp) in enumerate(zip(self.rnns, self.hidden_dps)):
            raw_output, new_h = rnn(raw_output, self.hidden[l])
            new_hidden.append(new_h)
            raw_outputs.append(raw_output)
            if l != self.n_layers - 1: raw_output = hid_dp(raw_output)
            outputs.append(raw_output) 
        self.hidden = to_detach(new_hidden)
        return raw_outputs, outputs

    def _one_hidden(self, l):
        "Return one hidden state."
        nh = self.n_hid if l != self.n_layers - 1 else self.emb_sz
        return next(self.parameters()).new(1, self.bs, nh).zero_()

    def reset(self):
        "Reset the hidden states."
        self.hidden = [(self._one_hidden(l), self._one_hidden(l)) for l in range(self.n_layers)]

In [27]:
class LinearDecoder(nn.Module):
    def __init__(self, n_out, n_hid, output_p, tie_encoder=None, bias=True):
        super().__init__()
        self.output_dp = RNNDropout(output_p)
        self.decoder = nn.Linear(n_hid, n_out, bias=bias)
        if bias: self.decoder.bias.data.zero_()
        if tie_encoder: self.decoder.weight = tie_encoder.weight
        else: init.kaiming_uniform_(self.decoder.weight)

    def forward(self, input):
        raw_outputs, outputs = input
        output = self.output_dp(outputs[-1]).contiguous()
        decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))
        return decoded, raw_outputs, outputs

In [28]:
class SequentialRNN(nn.Sequential):
    "A sequential module that passes the reset call to its children."
    def reset(self):
        for c in self.children():
            if hasattr(c, 'reset'): c.reset()

def get_language_model(vocab_sz, emb_sz, n_hid, n_layers, pad_token, output_p=0.4, hidden_p=0.2, input_p=0.6, embed_p=0.1, weight_p=0.5, tie_weights=True, bias=True):
    rnn_enc = AWD_LSTM(vocab_sz, emb_sz, n_hid=n_hid, n_layers=n_layers, pad_token=pad_token, hidden_p=hidden_p, input_p=input_p, embed_p=embed_p, weight_p=weight_p)
    enc = rnn_enc.emb if tie_weights else None
    return SequentialRNN(rnn_enc, LinearDecoder(vocab_sz, emb_sz, output_p, tie_encoder=enc, bias=bias))


def cross_entropy_flat(input, target):
    bs,sl = target.size()
    return F.cross_entropy(input.view(bs * sl, -1), target.view(bs * sl))

def accuracy(out, yb): 
    return (torch.argmax(out, dim=1)==yb).float().mean()

def accuracy_flat(input, target):
    bs,sl = target.size()
    return accuracy(input.view(bs * sl, -1), target.view(bs * sl))

In [0]:
emb_sz = 300 
nh = 300
nl =  2
vocab = {}
vocab['xxxpad'] = 0
vocab['test'] = 1
tok_pad = 0
model = get_language_model(len(vocab), emb_sz, nh, nl, tok_pad, input_p=0.6, output_p=0.4, weight_p=0.5, 
                           embed_p=0.1, hidden_p=0.2)

Transfer learning

In [0]:
#TODO: get the pretrained wikitext weights

Preprocess our data

In [0]:
base_path = Path('drive/My Drive/datasets')
df = pd.read_csv(base_path/'rspct.tsv', sep = '\t')
remove_n=800000
drop_indices = np.random.choice(df.index, remove_n, replace=False)
df = df.drop(drop_indices)

In [0]:
df

Unnamed: 0,id,subreddit,title,selftext
3,6ti6re,ringdoorbell,"Not door bell, but floodlight mount height.",I know this is a sub for the 'Ring Doorbell' b...
4,77sxto,intel,Worried about my 8700k small fft/data stress r...,"Prime95 (regardless of version) and OCCT both,..."
7,6icvfu,hockeyplayers,Inline Hockey: Where Do I Need To Be? (Positio...,My game is coming on well but one HUGE aspect ...
9,6azhj1,rawdenim,Had a custom embroidery job done on my ranch j...,[Album First](http://imgur.com/a/DYdKC)<lb><lb...
14,7i2bt4,NameThatSong,Can anyone help me find this band?,Thinking of 'who the band sounded like' brings...
15,6zltab,homeless,"About to be homeless in LA,CA",My parents have cost me my recent job and are ...
16,4zpvar,antidepressants,Sexual performance anxiety,Throwaway here. I've been on sertraline 100mg ...
19,8kuivg,sissyhypno,Regrets,So i finally got my first full body waxing don...
30,7cafsz,foreskin_restoration,Huge layer of skin peeled off from glans,WOW! came back home from a hunting trip and ha...
36,6ht1be,driving,[UK] Name change before getting full license,My provisional license has a minor spelling er...


In [8]:
#simple preprocess (lower case, remove weird symbols)
UNK = 'xxxunk'
BOS = 'xxxbos'
EOS = 'xxxeos'
def get_corpus(df):
    corpus = ''
    for i in range(len(df.index)):
        corpus += BOS +' '+ df.iloc[i, -1] + ' ' + df.iloc[i, -2] + ' ' + EOS
    return corpus

def to_lower(t):
    return t.lower()
        

def sub_br(t):
    "Replaces the <br /> by \n"
    re_br = re.compile(r'<\s*br\s*/?>', re.IGNORECASE)
    return re_br.sub(" ", t)


def sub_lb(t):
    "Replaces the <br /> by \n"
    re_br = re.compile(r'<\s*lb\s*/?>', re.IGNORECASE)
    return re_br.sub(" ", t)

def sub_nl(t):
    t.replace('\n',' ')
    return t

def spec_add_spaces(t):
    "Add spaces around / and #"
    return re.sub(r'([/#])', r' \1 ', t)

def rm_useless_spaces(t):
    "Remove multiple spaces"
    return re.sub(' {2,}', ' ', t)

def fixup_text(x):
    "Various messy things we've seen in documents"
    re1 = re.compile(r'  +')
    x = x.replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace(
        'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace(
        '<br />', " ").replace('\\"', '"').replace(' @.@ ','.').replace(
        ' @-@ ','-').replace('\\', ' \\ ')
    return re1.sub(' ', html.unescape(x))

#tokenize
def tokenize(corpus, vocab = None):
    tokenizer = spacy.blank("en").tokenizer
    doc = tokenizer(corpus)
    tokens = []
    for token in doc:
        if(token.text.strip() != ""):
            if(vocab != None and token.text not in vocab):
                tokens.append('xxxunk')
            else:
                tokens.append(token.text)
    return tokens


#generate vocab(REMEMBER TOKENS(eos, bos, pad, unk))
TITLE_START = 'xxxts'
TITLE_END = 'xxxte'

def generate_vocab(df, min_freq = 2):
    all_unique_words_counter = Counter()
    i = 0
    for index, row in df.iterrows():
        if(i % 100 == 0):
            print("done " + str(i) + "/" + str(len(df)))
        all_unique_words_counter += Counter(row[-1]) + Counter(row[-2])
        i += 1
    
    
        
    vocab = {}
    vocab['xxxpad'] = 0
    vocab['xxxunk'] = 1
    index = 2
    print("counter len: " + str(len(all_unique_words_counter)))
    i = 0
    for w in all_unique_words_counter.keys():
        if(i % 100 == 0):
            print('done 2nd loop ' + str(i) + '/' + str(len(all_unique_words_counter)))
        if(all_unique_words_counter[w] >= min_freq and w.strip() != ""):
            vocab[w] = index
            index += 1
        i+=1
    return vocab

#create language model dataloaders and stuff



In [0]:
corpus = get_corpus(df)
with open(base_path/'corpus.txt', 'w+') as f:
    f.write(corpus)

In [0]:
corpus = None
with open(base_path/'corpus.txt', 'r') as f:
    corpus = f.read()


In [0]:
corpus[:200]

"xxxbos I know this is a sub for the 'Ring Doorbell' but has anyone used the Floodlight?  I already have the wire and existing bracket for the floodlight on the back of my house, but the problem is tha"

In [0]:
df.head()

Unnamed: 0,id,subreddit,title,selftext
3,6ti6re,ringdoorbell,"Not door bell, but floodlight mount height.",I know this is a sub for the 'Ring Doorbell' b...
4,77sxto,intel,Worried about my 8700k small fft/data stress r...,"Prime95 (regardless of version) and OCCT both,..."
7,6icvfu,hockeyplayers,Inline Hockey: Where Do I Need To Be? (Positio...,My game is coming on well but one HUGE aspect ...
9,6azhj1,rawdenim,Had a custom embroidery job done on my ranch j...,[Album First](http://imgur.com/a/DYdKC)<lb><lb...
14,7i2bt4,NameThatSong,Can anyone help me find this band?,Thinking of 'who the band sounded like' brings...


In [0]:
preprocess_funcs = [sub_br, sub_lb, spec_add_spaces, rm_useless_spaces, fixup_text, to_lower, sub_nl]

In [79]:
table = []
EOS = 'xxxeos'
BOS = 'xxxbos'
TITLE_START = 'xxxts'
TITLE_END = 'xxxte'
prog = 0
for index, row in df.iterrows():
    title = row[-2]
    selftext = row[-1]
    
    if(prog % 100 == 0):
        print('done ' + str(prog) + ' out of ' + str(len(df)))
  
    for f in preprocess_funcs:
        title = f(title)
        selftext = f(selftext)
    
    title_tokens = tokenize(title)
    selftext_tokens = tokenize(selftext)
    
    title_tokens = [TITLE_START] + title_tokens + [TITLE_END]
    selftext_tokens = [BOS] + selftext_tokens +  [EOS]
    
    table.append([row[0], row[1], title_tokens, selftext_tokens])
    prog +=1 
   
    

good_df = pd.DataFrame(table, columns = ['id', 'subreddit', 'title', 'selftext'])
good_df.to_csv(base_path/'processed_data.csv', index = False)

done 0 out of 213000
done 100 out of 213000
done 200 out of 213000
done 300 out of 213000
done 400 out of 213000
done 500 out of 213000
done 600 out of 213000
done 700 out of 213000
done 800 out of 213000
done 900 out of 213000
done 1000 out of 213000
done 1100 out of 213000
done 1200 out of 213000
done 1300 out of 213000
done 1400 out of 213000
done 1500 out of 213000
done 1600 out of 213000
done 1700 out of 213000
done 1800 out of 213000
done 1900 out of 213000
done 2000 out of 213000
done 2100 out of 213000
done 2200 out of 213000
done 2300 out of 213000
done 2400 out of 213000
done 2500 out of 213000
done 2600 out of 213000
done 2700 out of 213000
done 2800 out of 213000
done 2900 out of 213000
done 3000 out of 213000
done 3100 out of 213000
done 3200 out of 213000
done 3300 out of 213000
done 3400 out of 213000
done 3500 out of 213000
done 3600 out of 213000
done 3700 out of 213000
done 3800 out of 213000
done 3900 out of 213000
done 4000 out of 213000
done 4100 out of 213000
done

KeyboardInterrupt: ignored

In [0]:
good_df = pd.DataFrame(table, columns = ['id', 'subreddit', 'title', 'selftext'])
good_df.to_csv(base_path/'processed_data.csv', index = False)

In [81]:
good_df.head()

Unnamed: 0,id,subreddit,title,selftext
0,6ti6re,ringdoorbell,"[xxxts, not, door, bell, ,, but, floodlight, m...","[xxxbos, i, know, this, is, a, sub, for, the, ..."
1,77sxto,intel,"[xxxts, worried, about, my, 8700k, small, fft,...","[xxxbos, prime95, (, regardless, of, version, ..."
2,6icvfu,hockeyplayers,"[xxxts, inline, hockey, :, where, do, i, need,...","[xxxbos, my, game, is, coming, on, well, but, ..."
3,6azhj1,rawdenim,"[xxxts, had, a, custom, embroidery, job, done,...","[xxxbos, [, album, first](http, :, /, /, imgur..."
4,7i2bt4,NameThatSong,"[xxxts, can, anyone, help, me, find, this, ban...","[xxxbos, thinking, of, ', who, the, band, soun..."


In [7]:
def converter(x):
    #convert "list" to list
    return literal_eval(x)
base_path = Path('storage/htn2019')
converters={'title': converter, 'selftext': converter}
df = pd.read_csv(base_path/'processed_data.csv', converters = converters)

In [9]:
def split_df_classification(df, vocab, category_encoder, pct = 0.1):
    index = int(len(df) * pct)
    train_df = df.iloc[0:(len(df) - index), :]
    valid_df = df.iloc[(len(df) - index):, :]
    
    train_txt_ds = ClassificationDataset(train_df, vocab, category_encoder)
    valid_txt_ds = ClassificationDataset(valid_df, vocab, category_encoder)
    
    return train_txt_ds, valid_txt_ds

In [86]:
test_df.iloc[0, -1]

['xxxbos',
 'i',
 'know',
 'this',
 'is',
 'a',
 'sub',
 'for',
 'the',
 "'",
 'ring',
 'doorbell',
 "'",
 'but',
 'has',
 'anyone',
 'used',
 'the',
 'floodlight',
 '?',
 'i',
 'already',
 'have',
 'the',
 'wire',
 'and',
 'existing',
 'bracket',
 'for',
 'the',
 'floodlight',
 'on',
 'the',
 'back',
 'of',
 'my',
 'house',
 ',',
 'but',
 'the',
 'problem',
 'is',
 'that',
 'it',
 "'s",
 'about',
 '12',
 'feet',
 'above',
 'ground',
 'level',
 '(',
 '10',
 'ft',
 'above',
 'the',
 'deck',
 ',',
 '2',
 'ft',
 'drop',
 'from',
 'the',
 'deck',
 'down',
 'to',
 'the',
 'grass',
 ')',
 'is',
 'that',
 'too',
 'high',
 'to',
 'mount',
 '?',
 'the',
 'website',
 'says',
 '9',
 'ft',
 'is',
 'ideal',
 '.',
 'anyone',
 'had',
 'any',
 'problems',
 'mounting',
 'it',
 'higher',
 'than',
 'that',
 '?',
 'xxxeos']

In [0]:
with open(base_path/'corpus.txt', 'w+') as f:
    f.write(corpus)

In [0]:
corpus[:100]

"xxxbos i know this is a sub for the 'ring doorbell' but has anyone used the floodlight? i already ha"

In [0]:
corpus = None
with open(base_path/'corpus.txt', 'r') as f:
    corpus = f.read()

In [0]:
UNK = 'xxxunk'
PAD = 'xxxpad'
BOS = 'xxxbos'
EOS = 'xxxeos'
default_spec_tok = [UNK, PAD, BOS, EOS]


def parallel(func, arr, max_workers=4):
    if max_workers<2: results = list(map(func, enumerate(arr)), total=len(arr))
    else:
        with ProcessPoolExecutor(max_workers=max_workers) as ex:
            return list(ex.map(func, enumerate(arr)), total=len(arr))
    if any([o is not None for o in results]): return results

class Processor(): 
    def process(self, items): return items
    
class TokenizeProcessor(Processor):
    def __init__(self, lang="en", chunksize=2000, pre_rules=None, post_rules=None, max_workers=4): 
        self.chunksize,self.max_workers = chunksize,max_workers
        self.tokenizer = spacy.blank(lang).tokenizer
        for w in default_spec_tok:
            self.tokenizer.add_special_case(w, [{ORTH: w}])
        #self.pre_rules  = default_pre_rules  if pre_rules  is None else pre_rules
        #self.post_rules = default_post_rules if post_rules is None else post_rules

    def proc_chunk(self, args):
        i,chunk = args
        #chunk = [compose(t, self.pre_rules) for t in chunk]
        docs = [[d.text for d in doc] for doc in self.tokenizer.pipe(chunk)]
        #docs = [compose(t, self.post_rules) for t in docs]
        return docs

    def __call__(self, items): 
        toks = []
        if isinstance(items[0], Path): items = [read_file(i) for i in items]
        chunks = [items[i: i+self.chunksize] for i in (range(0, len(items), self.chunksize))]
        toks = parallel(self.proc_chunk, chunks, max_workers=self.max_workers)
        return sum(toks, [])
    
    def proc1(self, item): return self.proc_chunk([item])[0]
    
    def deprocess(self, toks): return [self.deproc1(tok) for tok in toks]
    def deproc1(self, tok):    return " ".join(tok)

In [107]:
vocab = generate_vocab(good_df)

done 0/95331
done 100/95331
done 200/95331
done 300/95331
done 400/95331
done 500/95331
done 600/95331
done 700/95331
done 800/95331
done 900/95331
done 1000/95331
done 1100/95331
done 1200/95331
done 1300/95331
done 1400/95331
done 1500/95331
done 1600/95331
done 1700/95331
done 1800/95331
done 1900/95331
done 2000/95331
done 2100/95331
done 2200/95331
done 2300/95331
done 2400/95331
done 2500/95331
done 2600/95331
done 2700/95331
done 2800/95331
done 2900/95331
done 3000/95331
done 3100/95331
done 3200/95331
done 3300/95331
done 3400/95331
done 3500/95331
done 3600/95331
done 3700/95331
done 3800/95331
done 3900/95331
done 4000/95331
done 4100/95331
done 4200/95331
done 4300/95331
done 4400/95331
done 4500/95331
done 4600/95331
done 4700/95331
done 4800/95331
done 4900/95331
done 5000/95331
done 5100/95331
done 5200/95331
done 5300/95331
done 5400/95331
done 5500/95331
done 5600/95331
done 5700/95331
done 5800/95331
done 5900/95331
done 6000/95331
done 6100/95331
done 6200/95331
done

In [108]:
len(vocab)

101814

In [0]:
with open(base_path/'my_vocab.pkl', 'wb') as f:
    pickle.dump(vocab, f)

Train pretrained model with our data

In [24]:
class LM_PreLoader():
    def __init__(self, data, bs=64, bptt=70, shuffle=False):
        self.data,self.bs,self.bptt,self.shuffle = data,bs,bptt,shuffle
        total_len = sum([len(t) for t in data.data])
        self.n_batch = total_len // bs
        self.batchify()
    
    def __len__(self): return ((self.n_batch-1) // self.bptt) * self.bs
    
    def __getitem__(self, idx):
        source = self.batched_data[idx % self.bs]
        seq_idx = (idx // self.bs) * self.bptt
        return source[seq_idx:seq_idx+self.bptt],source[seq_idx+1:seq_idx+self.bptt+1]
    
    def batchify(self):
        texts = self.data.data
        if self.shuffle: texts = texts[torch.randperm(len(texts))]
        stream = torch.cat([tensor(t) for t in texts])
        self.batched_data = stream[:self.n_batch * self.bs].view(self.bs, self.n_batch)
        


In [10]:
def split_df(df, vocab, pct = 0.1):
    index = int(len(df) * pct)
    train_df = df.iloc[0:(len(df) - index), :]
    valid_df = df.iloc[(len(df) - index):, :]
    
    train_txt_ds = TextDataset(train_df, vocab)
    valid_txt_ds = TextDataset(valid_df, vocab)
    
    return train_txt_ds, valid_txt_ds

In [20]:
class TextDataset(Dataset):
    
    def __init__(self, df, vocab):
        self.data = []
        for index, row in df.iterrows():
            title_nums = []
            body_nums = []
            for x in row[-2]:
                if(x in vocab):
                    title_nums.append(vocab[x])
                else:
                    title_nums.append(vocab['xxxunk'])
                    
            for x in row[-1]:
                if(x in vocab):
                    body_nums.append(vocab[x])
                else:
                    body_nums.append(vocab['xxxunk'])
            self.data.append(title_nums + body_nums)
            
            
        
        
    def __getitem__(self, i):
        return self.data[i]
        
    def __len__(self):
        return len(self.data)

In [145]:
train, valid = split_df(good_df, vocab)

ValueError: ignored

In [120]:
len(x), len(y)

(85798, 9533)

In [0]:
with open(base_path/'train_txt_ds.pkl', 'wb') as f:
    pickle.dump(train, f)
    
with open(base_path/'valid_txt_ds.pkl', 'wb') as f:
    pickle.dump(valid, f)

In [0]:
lm_preloader = LM_PreLoader(valid)

In [0]:
dl = DataLoader(LM_PreLoader(valid, shuffle=False), batch_size=64)
iter_dl = iter(dl)
x1,y1 = next(iter_dl)
x2,y2 = next(iter_dl)

In [22]:
def get_lm_dls(train_ds, valid_ds, bs, bptt, **kwargs):
    return (DataLoader(LM_PreLoader(train_ds, bs, bptt, shuffle=False), batch_size=bs, **kwargs), #TODO: try to get shuffle = True to work
            DataLoader(LM_PreLoader(valid_ds, bs, bptt, shuffle=False), batch_size=2*bs, **kwargs))


NameError: name 'base_path' is not defined

In [21]:
lm_train_dl, lm_valid_dl = get_lm_dls(train, valid, 32, 50)

NameError: name 'get_lm_dls' is not defined

Get pretrained LM

In [7]:
base_path = Path('storage/htn2019')

In [16]:
old_wgts  = torch.load(base_path/'pretrained.pth')
old_vocab = pickle.load(open(base_path/'vocab.pkl', 'rb'))

In [16]:
vocab = None
with open(base_path/'my_vocab.pkl', 'rb') as f:
        vocab = pickle.load(f)
dps = tensor([0.1, 0.15, 0.25, 0.02, 0.2]) * 0.5
emb_sz, nh, nl = 300, 300, 2
model = get_language_model(len(vocab), emb_sz, nh, nl, vocab['xxxpad'], *dps)

In [17]:
def match_embeds(old_wgts, old_vocab, new_vocab):
    wgts = old_wgts['0.emb.weight']
    bias = old_wgts['1.decoder.bias']
    wgts_m,bias_m = wgts.mean(dim=0),bias.mean()
    new_wgts = wgts.new_zeros(len(new_vocab), wgts.size(1))
    new_bias = bias.new_zeros(len(new_vocab))
    otoi = {v:k for k,v in enumerate(old_vocab)}
    for i,w in enumerate(new_vocab): 
        if w in otoi:
            idx = otoi[w]
            new_wgts[i],new_bias[i] = wgts[idx],bias[idx]
        else: new_wgts[i],new_bias[i] = wgts_m,bias_m
    old_wgts['0.emb.weight']    = new_wgts
    old_wgts['0.emb_dp.emb.weight'] = new_wgts
    old_wgts['1.decoder.weight']    = new_wgts
    old_wgts['1.decoder.bias']      = new_bias
    return old_wgts

In [18]:
wgts = match_embeds(old_wgts, old_vocab, vocab)

In [24]:
model.load_state_dict(wgts)

<All keys matched successfully>

In [20]:
model.load_state_dict(torch.load(base_path/'lm_preunfreeze.pt'))

RuntimeError: unexpected EOF, expected 23633776 more bytes. The file might be corrupted.

In [0]:
def lm_splitter(m):
    groups = []
    for i in range(len(m[0].rnns)): groups.append(nn.Sequential(m[0].rnns[i], m[0].hidden_dps[i]))
    groups += [nn.Sequential(m[0].emb, m[0].emb_dp, m[0].input_dp, m[1])]
    return [list(o.parameters()) for o in groups]

In [21]:
for rnn in model[0].rnns:
    for p in rnn.parameters(): p.requires_grad_(False)

In [0]:
#learn = Learner(model, data, cross_entropy_flat, opt_func=adam_opt(),
               # cb_funcs=cbs, splitter=lm_splitter)

In [43]:
def to_detach(h):
    "Detaches `h` from its history."
    return h.detach() if type(h) == torch.Tensor else tuple(to_detach(v) for v in h)

def fit(model, train_dl, valid_dl, loss_func, opt, num_epochs, acc_func):
    model = model.cuda()
    a = 2.
    b = 1.
    for epoch in range(num_epochs):
        train_loss = 0
        valid_loss = 0
        valid_acc = 0
        for xb, yb in train_dl:
            model.train()
            pred = model(xb.cuda())
            loss = loss_func(pred[0], yb.cuda())
           
            raw_out,out = pred[1], pred[2]
            if a != 0.:  
                loss += a * out[-1].float().pow(2).mean()
            
            if b != 0.:
                h = raw_out[-1]
                if len(h)>1: 
                    loss += b * (h[:,1:] - h[:,:-1]).float().pow(2).mean()
            
            train_loss += loss
            loss.backward()
            opt.step()
            
            #gradient clipping
            nn.utils.clip_grad_norm_(model.parameters(), 0.1)
            opt.zero_grad()
            
        model.eval()
        with torch.no_grad():
            for xb, yb in valid_dl:
                pred = model(xb.cuda())
                loss = loss_func(pred[0], yb.cuda())
                valid_loss += loss
                acc = acc_func(pred[0], yb.cuda())
                valid_acc += acc
                
        print("Epoch {0} complete. Train loss: {1}. Valid loss {2}. Valid Accuracy {3}".format(epoch, train_loss / len(train_dl), valid_loss / len(valid_dl), valid_acc / len(valid_dl)))
            

In [33]:
dps = tensor([0.1, 0.15, 0.25, 0.02, 0.2]) * 0.5
emb_sz, nh, nl = 300, 300, 2

base_path = Path('storage/htn2019')
vocab = None
with open(base_path/'my_vocab.pkl', 'rb') as f:
        vocab = pickle.load(f)

model = get_language_model(len(vocab), emb_sz, nh, nl, vocab['xxxpad'], *dps)
model.load_state_dict(wgts)


train = None
with open(base_path/'train_txt_ds.pkl', 'rb') as f:
    train = pickle.load(f)

valid = None
with open(base_path/'valid_txt_ds.pkl', 'rb') as f:
    valid = pickle.load(f)

lm_train_dl, lm_valid_dl = get_lm_dls(train, valid, 64, 70)
torch.cuda.empty_cache()
for rnn in model[0].rnns:
    for p in rnn.parameters(): p.requires_grad_(False)

        
fit(model, lm_train_dl, lm_valid_dl, cross_entropy_flat, torch.optim.Adam(model.parameters(), lr = 2e-2), 1, accuracy_flat)

Epoch 0 complete. Train loss: 5.264625549316406. Valid loss 5.260253429412842. Valid Accuracy 0.19097912311553955


KeyboardInterrupt: 

In [35]:
torch.save(model.state_dict(), base_path/'lm_before_unfreeze.pt')

In [36]:
for rnn in model[0].rnns:
    for p in rnn.parameters(): 
        p.requires_grad_(True)
       
fit(model, lm_train_dl, lm_valid_dl, cross_entropy_flat, torch.optim.Adam(model.parameters(), lr = 2e-2), 10, accuracy_flat)

Epoch 0 complete. Train loss: 4.9907379150390625. Valid loss 5.034150123596191. Valid Accuracy 0.20948079228401184
Epoch 1 complete. Train loss: 21.26679801940918. Valid loss 41.60452651977539. Valid Accuracy 0.0016559930518269539
Epoch 2 complete. Train loss: 32.195579528808594. Valid loss 28.752336502075195. Valid Accuracy 0.008455846458673477


KeyboardInterrupt: 

In [0]:
torch.save(model.state_dict(), base_path/'lm_fine_tuned.pt')

In [1]:
model.load_state_dict(torch.load(base_path/'lm_before_unfreeze.pt'))

NameError: name 'model' is not defined

torch.save(learn.model[0].state_dict(), path/'pre_enc.pth')

In [45]:
class ClassificationDataset(Dataset):
    
    def __init__(self, df, vocab, category_encoder):
        self.x = []
        self.y = []
        for index, row in df.iterrows():
            title_nums = []
            body_nums = []
            for x in row[-2]:
                if(x in vocab):
                    title_nums.append(vocab[x])
                else:
                    title_nums.append(vocab['xxxunk'])
                    
            for x in row[-1]:
                if(x in vocab):
                    body_nums.append(vocab[x])
                else:
                    body_nums.append(vocab['xxxunk'])
            self.x.append(title_nums + body_nums)
            self.y.append(category_encoder[row[1]])
        self.y = LongTensor(self.y)
            
    def __getitem__(self, i):
        return self.x[i], self.y[i]
        
    def __len__(self):
        return len(self.x)


    def pad_collate(self, pad_idx, pad_first=False):
        max_len = max([len(s) for s in self.x])
        res = torch.zeros(len(self.x), max_len).long() + pad_idx
        for i,s in enumerate(self.x):
            if pad_first: res[i, -len(s):] = LongTensor(s)
            else:         res[i, :len(s) ] = LongTensor(s)
        self.x = res

In [8]:
base_path = Path('storage/htn2019')

def create_category_encoder(categories):
    encoder = dict()
    i = 0
    for c in categories:
        encoder[c] = i
        i += 1
    return encoder

def get_categories(df):
    cats = set()
    for i, row in df.iterrows():
        cats.add(row[1])
    return cats

In [11]:
category_encoder = None
with open(base_path/'category_encoder.pkl', 'rb') as f:
    category_encoder = pickle.load(f)

In [64]:
train_ds = None
valid_ds = None
with open(base_path/'train_class_ds.pkl', 'rb') as f:
    train_ds = pickle.load(f)
    
with open(base_path/'valid_class_ds.pkl', 'rb') as f:
    valid_ds = pickle.load(f)

In [12]:
vocab = None
with open(base_path/'my_vocab.pkl', 'rb') as f:
    vocab = pickle.load(f)

In [13]:
#valid_ds.pad_collate(vocab['xxxpad'])
#train_ds.pad_collate(vocab['xxxpad'])
def pad_collate(samples, pad_idx=0, pad_first=False):
    max_len = max([len(s[0]) for s in samples])
    res = torch.zeros(len(samples), max_len).long() + pad_idx
    for i,s in enumerate(samples):
        if pad_first: res[i, -len(s[0]):] = LongTensor(s[0])
        else:         res[i, :len(s[0]) ] = LongTensor(s[0])
    return res, tensor([s[1] for s in samples])

In [14]:
from torch.utils.data import Sampler

class SortSampler(Sampler):
    def __init__(self, data_source, key): self.data_source,self.key = data_source,key
    def __len__(self): return len(self.data_source)
    def __iter__(self):
        return iter(sorted(list(range(len(self.data_source))), key=self.key, reverse=True))

In [70]:
type(train_ds.x), type(valid_ds.x)

(list, torch.Tensor)

In [46]:
train_ds, valid_ds = split_df_classification(df, vocab, category_encoder)

train_sampler = SortSampler(train_ds.x, key=lambda t: len(train_ds.x[t]))
valid_sampler = SortSampler(valid_ds.x, key=lambda t: len(valid_ds.x[t]))

train_dl = DataLoader(train_ds, batch_size = 64, sampler = train_sampler, collate_fn = pad_collate)
valid_dl = DataLoader(valid_ds, batch_size = 128, sampler = valid_sampler, collate_fn = pad_collate)

In [29]:
class AWD_LSTM1(nn.Module):
    "AWD-LSTM from fastai."
    initrange=0.1

    def __init__(self, vocab_sz, emb_sz, n_hid, n_layers, pad_token,
                 hidden_p=0.2, input_p=0.6, embed_p=0.1, weight_p=0.5):
        super().__init__()
        self.bs,self.emb_sz,self.n_hid,self.n_layers,self.pad_token = 1,emb_sz,n_hid,n_layers,pad_token
        self.emb = nn.Embedding(vocab_sz, emb_sz, padding_idx=pad_token)
        self.emb_dp = EmbeddingDropout(self.emb, embed_p)
        self.rnns = [nn.LSTM(emb_sz if l == 0 else n_hid, (n_hid if l != n_layers - 1 else emb_sz), 1,
                             batch_first=True) for l in range(n_layers)]
        self.rnns = nn.ModuleList([WeightDropout(rnn, weight_p) for rnn in self.rnns])
        self.emb.weight.data.uniform_(-self.initrange, self.initrange)
        self.input_dp = RNNDropout(input_p)
        self.hidden_dps = nn.ModuleList([RNNDropout(hidden_p) for l in range(n_layers)])

    def forward(self, input):
        bs,sl = input.size()
        mask = (input == self.pad_token)
        lengths = sl - mask.long().sum(1)
        n_empty = (lengths == 0).sum()
        if n_empty > 0:
            input = input[:-n_empty]
            lengths = lengths[:-n_empty]
            self.hidden = [(h[0][:,:input.size(0)], h[1][:,:input.size(0)]) for h in self.hidden]
        raw_output = self.input_dp(self.emb_dp(input))
        new_hidden,raw_outputs,outputs = [],[],[]
        for l, (rnn,hid_dp) in enumerate(zip(self.rnns, self.hidden_dps)):
            raw_output = pack_padded_sequence(raw_output, lengths, batch_first=True)
            raw_output, new_h = rnn(raw_output, self.hidden[l])
            raw_output = pad_packed_sequence(raw_output, batch_first=True)[0]
            raw_outputs.append(raw_output)
            if l != self.n_layers - 1: raw_output = hid_dp(raw_output)
            outputs.append(raw_output)
            new_hidden.append(new_h)
        self.hidden = to_detach(new_hidden)
        return raw_outputs, outputs, mask
    
    
    def _one_hidden(self, l):
        "Return one hidden state."
        nh = self.n_hid if l != self.n_layers - 1 else self.emb_sz
        return next(self.parameters()).new(1, self.bs, nh).zero_()

    def reset(self):
        "Reset the hidden states."
        self.hidden = [(self._one_hidden(l), self._one_hidden(l)) for l in range(self.n_layers)]

In [17]:
d= iter(train_dl)

In [18]:
type(d)

torch.utils.data.dataloader._SingleProcessDataLoaderIter

In [38]:
class Pooling(nn.Module):
    def forward(self, input):
        raw_outputs,outputs,mask = input
        output = outputs[-1]
        lengths = output.size(1) - mask.long().sum(dim=1)
        avg_pool = output.masked_fill(mask[:,:,None], 0).sum(dim=1)
        avg_pool.div_(lengths.type(avg_pool.dtype)[:,None])
        max_pool = output.masked_fill(mask[:,:,None], -float('inf')).max(dim=1)[0]
        x = torch.cat([output[torch.arange(0, output.size(0)),lengths-1], max_pool, avg_pool], 1) #Concat pooling.
        return output,x

In [37]:
def bn_drop_lin(n_in, n_out, bn=True, p=0., actn=None):
    layers = [nn.BatchNorm1d(n_in)] if bn else []
    if p != 0: layers.append(nn.Dropout(p))
    layers.append(nn.Linear(n_in, n_out))
    if actn is not None: layers.append(actn)
    return layers

In [39]:
class PoolingLinearClassifier(nn.Module):
    "Create a linear classifier with pooling."

    def __init__(self, layers, drops):
        super().__init__()
        mod_layers = []
        activs = [nn.ReLU(inplace=True)] * (len(layers) - 2) + [None]
        for n_in, n_out, p, actn in zip(layers[:-1], layers[1:], drops, activs):
            mod_layers += bn_drop_lin(n_in, n_out, p=p, actn=actn)
        self.layers = nn.Sequential(*mod_layers)
       

    def forward(self, input):
        raw_outputs,outputs,mask = input
        output = outputs[-1]
        lengths = output.size(1) - mask.long().sum(dim=1)
        avg_pool = output.masked_fill(mask[:,:,None], 0).sum(dim=1)
        avg_pool.div_(lengths.type(avg_pool.dtype)[:,None])
        max_pool = output.masked_fill(mask[:,:,None], -float('inf')).max(dim=1)[0]
        x = torch.cat([output[torch.arange(0, output.size(0)),lengths-1], max_pool, avg_pool], 1) #Concat pooling.
        x = self.layers(x)
        return x

In [33]:
def pad_tensor(t, bs, val=0.):
    if t.size(0) < bs:
        return torch.cat([t, val + t.new_zeros(bs-t.size(0), *t.shape[1:])])
    return t

In [34]:
class SentenceEncoder(nn.Module):
    def __init__(self, module, bptt, pad_idx=0):
        super().__init__()
        self.bptt,self.module,self.pad_idx = bptt,module,pad_idx

    def concat(self, arrs, bs):
        return [torch.cat([pad_tensor(l[si],bs) for l in arrs], dim=1) for si in range(len(arrs[0]))]
    
    def forward(self, input):
        bs,sl = input.size()
        self.module.bs = bs
        self.module.reset()
        raw_outputs,outputs,masks = [],[],[]
        for i in range(0, sl, self.bptt):
            r,o,m = self.module(input[:,i: min(i+self.bptt, sl)])
            masks.append(pad_tensor(m, bs, 1))
            raw_outputs.append(r)
            outputs.append(o)
        return self.concat(raw_outputs, bs),self.concat(outputs, bs),torch.cat(masks,dim=1)

In [35]:
def get_text_classifier(vocab_sz, emb_sz, n_hid, n_layers, n_out, pad_token, bptt, output_p=0.4, hidden_p=0.2, 
                        input_p=0.6, embed_p=0.1, weight_p=0.5, layers=None, drops=None):
    "To create a full AWD-LSTM"
    rnn_enc = AWD_LSTM1(vocab_sz, emb_sz, n_hid=n_hid, n_layers=n_layers, pad_token=pad_token,
                        hidden_p=hidden_p, input_p=input_p, embed_p=embed_p, weight_p=weight_p)
    enc = SentenceEncoder(rnn_enc, bptt)
    if layers is None: 
        layers = [50]
    if drops is None:  
        drops = [0.1] * len(layers)
    layers = [3 * emb_sz] + layers + [n_out] 
    drops = [output_p] + drops
    return SequentialRNN(enc, PoolingLinearClassifier(layers, drops))

In [89]:
emb_sz = 300
nh = 300 
nl = 2
bptt = 70
dps = tensor([0.4, 0.3, 0.4, 0.05, 0.5]) * 0.25
model = get_text_classifier(len(vocab), emb_sz, nh, nl, len(category_encoder), vocab['xxxpad'], bptt, *dps)

Sequential(
  (0): BatchNorm1d(900, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (1): Dropout(p=0.10000000149011612, inplace=False)
  (2): Linear(in_features=900, out_features=50, bias=True)
  (3): ReLU(inplace=True)
  (4): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (5): Dropout(p=0.1, inplace=False)
  (6): Linear(in_features=50, out_features=1013, bias=True)
)


In [32]:
model[0].module.load_state_dict(torch.load(base_path/'pre_enc.pth'))

<All keys matched successfully>

In [48]:

def fit_classification(model, train_dl, valid_dl, loss_func, opt, num_epochs, acc_func):
    model = model.cuda()
    for epoch in range(num_epochs):
        train_loss = 0
        valid_loss = 0
        valid_acc = 0
        for xb, yb in train_dl:
            model.train()
           
            pred = model(xb.cuda())
            loss = loss_func(pred, yb.cuda())
            
            train_loss += loss
            loss.backward()
            opt.step()
            
            #gradient clipping
            nn.utils.clip_grad_norm_(model.parameters(), 0.1)
            opt.zero_grad()
            
        model.eval()
        with torch.no_grad():
            for xb, yb in valid_dl:
                pred = model(xb.cuda())
                loss = loss_func(pred, yb.cuda())
                valid_loss += loss
                acc = acc_func(pred, yb.cuda())
                valid_acc += acc
               
               
                
        print("Epoch {0} complete. Train loss: {1}. Valid loss {2}. Valid Accuracy {3}".format(epoch, train_loss / len(train_dl), valid_loss / len(valid_dl), valid_acc / len(valid_dl)))
       

In [52]:
emb_sz = 300
nh = 300 
nl = 2
bptt = 70
dps = tensor([0.4, 0.3, 0.4, 0.05, 0.5]) * 0.25
model = get_text_classifier(len(vocab), emb_sz, nh, nl, len(category_encoder), vocab['xxxpad'], bptt, *dps)
model[0].module.load_state_dict(torch.load(base_path/'pre_enc.pth'))

for p in model[0].parameters(): 
    p.requires_grad_(False)
fit_classification(model, train_dl, valid_dl, F.cross_entropy, torch.optim.Adam(model.parameters(), lr = 2e-2), 3, accuracy)

Epoch 0 complete. Train loss: 4.723647594451904. Valid loss 4.128415584564209. Valid Accuracy 0.16313353180885315
Epoch 1 complete. Train loss: 4.105393409729004. Valid loss 3.885662078857422. Valid Accuracy 0.20418545603752136
Epoch 2 complete. Train loss: 3.9674201011657715. Valid loss 3.8172800540924072. Valid Accuracy 0.2184767723083496


In [53]:
torch.save(model.state_dict(), base_path/'classifier_1.pt')

In [59]:
model.load_state_dict(torch.load(base_path/'classifier_1.pt'))

<All keys matched successfully>

In [60]:
for p in model[0].module.rnns[-1].parameters(): p.requires_grad_(True)
fit_classification(model, train_dl, valid_dl, F.cross_entropy, torch.optim.Adam(model.parameters(), lr = 2e-2), 2, accuracy)

Epoch 0 complete. Train loss: 3.8677234649658203. Valid loss 3.708519458770752. Valid Accuracy 0.24500855803489685
Epoch 1 complete. Train loss: 3.5215775966644287. Valid loss 3.542172431945801. Valid Accuracy 0.2768203616142273


In [61]:
torch.save(model.state_dict(), base_path/'classifier_2.pt')

In [62]:
for p in model[0].parameters(): p.requires_grad_(True)
fit_classification(model, train_dl, valid_dl, F.cross_entropy, torch.optim.Adam(model.parameters(), lr = 1e-2), 2, accuracy)

Epoch 0 complete. Train loss: 4.2987470626831055. Valid loss 3.8495402336120605. Valid Accuracy 0.26598700881004333
Epoch 1 complete. Train loss: 3.706559896469116. Valid loss 3.43371844291687. Valid Accuracy 0.34745559096336365


In [63]:
torch.save(model.state_dict(), base_path/'classifier_3.pt')

In [4]:
t.cuda()

tensor([[ 1.2319e-03,  4.5630e-41,  1.2211e-03,  4.5630e-41,  1.2049e-03],
        [ 4.5630e-41, -6.7574e-16,  4.5629e-41, -6.8373e-16,  4.5629e-41],
        [ 2.3710e-04,  4.5630e-41,  2.1992e-04,  4.5630e-41, -6.8557e-16],
        [ 4.5629e-41, -6.7883e-16,  4.5629e-41, -6.7886e-16,  4.5629e-41],
        [ 2.0539e-04,  4.5630e-41,  1.4515e-05,  4.5630e-41,  1.2048e-03]],
       device='cuda:0')