In [40]:
import torch
from torch import nn, Tensor
from torch.nn import TransformerEncoder, TransformerEncoderLayer
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, dataset
import numpy as np
from matplotlib import pyplot
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk import word_tokenize
import nltk
from data_loading_code import preprocess_pandas
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, classification_report

import math
import os
from tempfile import TemporaryDirectory
from typing import Tuple

import time


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device is", device)

device is cpu


In [41]:
class TransformerModel(nn.Module):

    def __init__(self, ntoken: int, d_model: int, nhead: int, d_hid: int,
                 nlayers: int, dropout: float = 0.5):
        super().__init__()
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        encoder_layers = TransformerEncoderLayer(d_model, nhead, d_hid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.embedding = nn.Embedding(ntoken, d_model)
        self.d_model = d_model
        self.linear = nn.Linear(d_model, ntoken)

        self.init_weights()

    def init_weights(self) -> None:
        initrange = 0.1
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.linear.bias.data.zero_()
        self.linear.weight.data.uniform_(-initrange, initrange)

    def forward(self, src: Tensor, src_mask: Tensor = None) -> Tensor:
        """
        Arguments:
            src: Tensor, shape ``[seq_len, batch_size]``
            src_mask: Tensor, shape ``[seq_len, seq_len]``

        Returns:
            output Tensor of shape ``[seq_len, batch_size, ntoken]``
        """
        src = self.embedding(src) * math.sqrt(self.d_model)
        src = self.pos_encoder(src)
        if src_mask is None:
            """Generate a square causal mask for the sequence. The masked positions are filled with float('-inf').
            Unmasked positions are filled with float(0.0).
            """
            src_mask = nn.Transformer.generate_square_subsequent_mask(len(src)).to(device)
        output = self.transformer_encoder(src, src_mask)
        output = self.linear(output)
        return output

class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Arguments:
            x: Tensor, shape ``[seq_len, batch_size, embedding_dim]``
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [42]:
from torchtext.datasets import WikiText2
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator       
import copy     # pre-process

data = pd.read_csv("amazon_cells_labelled.txt", delimiter='\t', header=None)
data.columns = ['Sentence', 'Class']
data['index'] = data.index                                          # add new column index
columns = ['index', 'Class', 'Sentence']
data = preprocess_pandas(data, columns)   

train_iter, val_iter, test_iter = \
              np.split(data.sample(frac=1, random_state=42), 
                       [int(.6*len(data)), int(.8*len(data))])

train_iter2 = copy.deepcopy(train_iter)

tokenizer = get_tokenizer('basic_english')
vocab = build_vocab_from_iterator(map(tokenizer, train_iter2["Sentence"]), specials=['<unk>'])
vocab.set_default_index(vocab['<unk>'])

#nltk.download('punkt')
#nltk.download('stopwords')
# get data, pre-process and split              

def data_process(raw_text_iter: dataset.IterableDataset) -> Tensor:
    """Converts raw text into a flat Tensor."""
    data2 = [torch.tensor(vocab(tokenizer(item)), dtype=torch.long) for item in raw_text_iter]
    print(data2)
    return torch.cat(tuple(filter(lambda t: t.numel() > 0, data2)))


train_data = data_process(train_iter["Sentence"])
val_data = data_process(val_iter["Sentence"])
test_data = data_process(test_iter["Sentence"])


def batchify(data: Tensor, bsz: int) -> Tensor:
    """Divides the data into ``bsz`` separate sequences, removing extra elements
    that wouldn't cleanly fit.

    Arguments:
        data: Tensor, shape ``[N]``
        bsz: int, batch size

    Returns:
        Tensor of shape ``[N // bsz, bsz]``
    """
    seq_len = data.size(0) // bsz
    data = data[:seq_len * bsz]
    data = data.view(bsz, seq_len).t().contiguous()
    return data.to(device)

batch_size = 40
eval_batch_size = 40
train_data = batchify(train_data, batch_size)  # shape ``[seq_len, batch_size]``
val_data = batchify(val_data, eval_batch_size)
test_data = batchify(test_data, eval_batch_size)

#training_data, validation_data, training_labels, validation_labels = train_test_split( # split the data into training, validation, and test splits
#    data['Sentence'].values.astype('U'),
#    data['Class'].values.astype('int32'),
#    test_size=0.10,
#    random_state=0,
#    shuffle=True
#)
#
## vectorize data using TFIDF and transform for PyTorch for scalability
#word_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1,2), max_features=50000, max_df=0.5, use_idf=True, norm='l2')
#training_data = word_vectorizer.fit_transform(training_data)        # transform texts to sparse matrix
#training_data = training_data.todense()                             # convert to dense matrix for Pytorch
#vocab_size = len(word_vectorizer.vocabulary_)
#validation_data = word_vectorizer.transform(validation_data)
#validation_data = validation_data.todense()
#
#train_x_tensor = torch.from_numpy(np.array(training_data)).type(torch.FloatTensor)
#train_y_tensor = torch.from_numpy(np.array(training_labels)).long()
#validation_x_tensor = torch.from_numpy(np.array(validation_data)).type(torch.FloatTensor)
#validation_y_tensor = torch.from_numpy(np.array(validation_labels)).long()
#
#class TransfromerDataset(torch.utils.data.Dataset):
#    def __init__(self, datasetA, bptt):
#        self.source = datasetA
#        self.bptt = bptt
#
#    def __getitem__(self, i):
#        seq_len = min(self.bptt, len(self.source) - 1 - i)
#        data = self.source[i:i+seq_len]
#        target = self.source[i+1:i+1+seq_len].reshape(-1)
#        return data, target
#
#    def __len__(self):
#        return min(len(self.datasetA))
#    
#train_ds = ConcatDataset(train_x_tensor,train_y_tensor)
#val_ds = ConcatDataset(validation_x_tensor,validation_y_tensor)
#train_loader = DataLoader(train_ds,batch_size=5)
#val_loader = DataLoader(val_ds,batch_size=5)

[tensor([1280,  194,    9,  167,   16,  243,    2,  551,    3,  254,   16,   10,
          26,   68,   12]), tensor([   2, 1303,    7,  566,  435,    4,  182,    5,  106, 1295,    6,  419,
           1]), tensor([  3,  61, 140,  48,   8,  29,   1]), tensor([   8,   59,   18,  121,    2,  333, 1315,  412,    4,    5,  170,    2,
         120,   89,    3,  187,    9,  126,    5,   24,    1]), tensor([  3, 140,  48,   8, 117,   9, 816,  12]), tensor([   5,   30,  119,   21,   14,  124,  674,    6,    4,   10,  209,   51,
          47,    2, 1365,   31,  138,    1]), tensor([  73,   11,   25,  123,    2,  162,  327,   22,    3,  138,    4, 1102,
          73,   11,   25,   72,    8,   13,    1]), tensor([   3,   92,    2,  606,   49,   16,   14,   94,  333, 1335,    4,    5,
          78,   60, 1346,   46, 1127,   14,  892,  257,   38,  155,    1]), tensor([147,  29,   6,   4, 130,  15,  93,  12]), tensor([ 23,  26, 542,  16,   2,  68,   1]), tensor([365,   5,  30,  62, 365,   5,   7, 104,

  return bound(*args, **kwds)


In [43]:
bptt = 35
def get_batch(source: Tensor, i: int) -> Tuple[Tensor, Tensor]:
    """
    Args:
        source: Tensor, shape ``[full_seq_len, batch_size]``
        i: int

    Returns:
        tuple (data, target), where data has shape ``[seq_len, batch_size]`` and
        target has shape ``[seq_len * batch_size]``
    """
    seq_len = min(bptt, len(source) - 1 - i)
    data = source[i:i+seq_len]
    target = source[i+1:i+1+seq_len].reshape(-1)
    return data, target


In [44]:
ntokens = len(vocab)  # size of vocabulary
emsize = 200  # embedding dimension
d_hid = 200  # dimension of the feedforward network model in ``nn.TransformerEncoder``
nlayers = 2  # number of ``nn.TransformerEncoderLayer`` in ``nn.TransformerEncoder``
nhead = 2  # number of heads in ``nn.MultiheadAttention``
dropout = 0.5  # dropout probability
model = TransformerModel(ntokens, emsize, nhead, d_hid, nlayers, dropout).to(device)



In [45]:
criterion = nn.CrossEntropyLoss()
lr = 5.0  # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

def train(model: nn.Module) -> None:
    model.train()  # turn on train mode
    total_loss = 0.
    log_interval = 2
    start_time = time.time()

    num_batches = len(train_data) // bptt
    for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
        data, targets = get_batch(train_data, i)
        output = model(data)
        output_flat = output.view(-1, ntokens)
        loss = criterion(output_flat, targets)

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        if batch % log_interval == 0 and batch > 0:
            lr = scheduler.get_last_lr()[0]
            ms_per_batch = (time.time() - start_time) * 1000 / log_interval
            cur_loss = total_loss / log_interval
            ppl = math.exp(cur_loss)
            print(f'| epoch {epoch:3d} | {batch:5d}/{num_batches:5d} batches | '
                  f'lr {lr:02.2f} | ms/batch {ms_per_batch:5.2f} | '
                  f'loss {cur_loss:5.2f} | ppl {ppl:8.2f}')
            total_loss = 0
            start_time = time.time()

def evaluate(model: nn.Module, eval_data: Tensor) -> float:
    model.eval()  # turn on evaluation mode
    total_loss = 0.
    with torch.no_grad():
        for i in range(0, eval_data.size(0) - 1, bptt):
            data, targets = get_batch(eval_data, i)
            seq_len = data.size(0)
            output = model(data)
            output_flat = output.view(-1, ntokens)
            total_loss += seq_len * criterion(output_flat, targets).item()
    return total_loss / (len(eval_data) - 1)

In [46]:
best_val_loss = float('inf')
epochs = 70

with TemporaryDirectory() as tempdir:
    best_model_params_path = os.path.join(tempdir, "best_model_params.pt")

    for epoch in range(1, epochs + 1):
        epoch_start_time = time.time()
        train(model)
        val_loss = evaluate(model, val_data)
        val_ppl = math.exp(val_loss)
        elapsed = time.time() - epoch_start_time
        print('-' * 89)
        print(f'| end of epoch {epoch:3d} | time: {elapsed:5.2f}s | '
            f'valid loss {val_loss:5.2f} | valid ppl {val_ppl:8.2f}')
        print('-' * 89)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), best_model_params_path)

        scheduler.step()
    model.load_state_dict(torch.load(best_model_params_path)) # load best model states

| epoch   1 |     2/    5 batches | lr 5.00 | ms/batch 230.23 | loss 11.57 | ppl 106252.98
| epoch   1 |     4/    5 batches | lr 5.00 | ms/batch 155.49 | loss  7.47 | ppl  1753.25
-----------------------------------------------------------------------------------------
| end of epoch   1 | time:  0.89s | valid loss  7.90 | valid ppl  2688.07
-----------------------------------------------------------------------------------------
| epoch   2 |     2/    5 batches | lr 4.75 | ms/batch 233.65 | loss 10.54 | ppl 37723.93
| epoch   2 |     4/    5 batches | lr 4.75 | ms/batch 158.36 | loss  6.59 | ppl   730.40
-----------------------------------------------------------------------------------------
| end of epoch   2 | time:  0.90s | valid loss  6.89 | valid ppl   982.01
-----------------------------------------------------------------------------------------
| epoch   3 |     2/    5 batches | lr 4.51 | ms/batch 221.01 | loss 10.55 | ppl 38148.53
| epoch   3 |     4/    5 batches | lr 4.

In [47]:
inp = "I like this phone"
tokens = tokenizer(inp)
embed = torch.tensor([vocab(tokens)], dtype=torch.long)
print(inp)
print(tokens)
print(embed)
print(embed.shape)
print(embed.dtype)
print(type(embed))
output = model(embed)
print(output.shape)
output_flat = output.view(-1, ntokens)
print(output_flat.shape)
print(output_flat)


I like this phone
['i', 'like', 'this', 'phone']
tensor([[ 3, 56,  8, 13]])
torch.Size([1, 4])
torch.int64
<class 'torch.Tensor'>
torch.Size([1, 4, 1370])
torch.Size([4, 1370])
tensor([[-0.6971,  1.9899,  1.9780,  ...,  0.0844, -0.1582, -0.0935],
        [ 0.0340,  5.3120,  7.1593,  ..., -0.4306,  0.1480, -0.5205],
        [ 0.2668,  5.0340,  2.4351,  ..., -0.0494, -1.0521, -0.9537],
        [-0.8155,  7.3042,  2.9632,  ..., -1.0899,  0.6805, -0.0595]],
       grad_fn=<ViewBackward0>)


In [85]:
class FFF(nn.Module):
    def __init__(self,vocab_sz, encoder):
        super().__init__()
        self.encoder = encoder
        self.linear1=nn.Linear(vocab_sz,100)
        self.linear2=nn.Linear(100,10)
        self.linear3=nn.Linear(10,2)
        self.sm = nn.Softmax()
    
    def forward(self,x):
        x = self.encoder(x)
        x = torch.mean(x.view(-1, ntokens),dim=0)
        x = self.linear1(x)
        x = self.linear2(x)
        x = self.linear3(x)
        x = self.sm(x)
        return x

In [86]:
torch.Size([1, 4, 1370])

torch.Size([1, 4, 1370])

In [87]:
x = torch.rand((4,8))
print(x)
x = x[0,:]
print(x)


tensor([[0.9346, 0.3909, 0.5329, 0.2755, 0.5809, 0.2202, 0.9995, 0.4187],
        [0.2758, 0.9138, 0.3508, 0.8425, 0.5495, 0.7500, 0.1883, 0.6494],
        [0.0229, 0.4548, 0.8889, 0.8081, 0.4092, 0.6145, 0.7072, 0.9636],
        [0.5418, 0.4581, 0.7521, 0.3205, 0.6139, 0.8473, 0.0696, 0.2117]])
tensor([0.9346, 0.3909, 0.5329, 0.2755, 0.5809, 0.2202, 0.9995, 0.4187])


In [88]:
data = pd.read_csv("amazon_cells_labelled.txt", delimiter='\t', header=None)
data.columns = ['Sentence', 'Class']
data['index'] = data.index                                          # add new column index
columns = ['index', 'Class', 'Sentence']
data = preprocess_pandas(data, columns)                             # pre-process
training_data, validation_data, training_labels, validation_labels = train_test_split( # split the data into training, validation, and test splits
    data['Sentence'].values.astype('U'),
    data['Class'].values.astype('int32'),
    test_size=0.10,
    random_state=0,
    shuffle=True
)

print(training_data)
print(type(training_data[0]))

# vectorize data using TFIDF and transform for PyTorch for scalability
#word_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1,2), max_features=50000, max_df=0.5, use_idf=True, norm='l2')
#training_data = word_vectorizer.fit_transform(training_data)        # transform texts to sparse matrix
#training_data = training_data.todense()                             # convert to dense matrix for Pytorch
#vocab_size = len(word_vectorizer.vocabulary_)
#validation_data = word_vectorizer.transform(validation_data)
#validation_data = validation_data.todense()

#train_x_tensor = torch.from_numpy(np.array(training_data)).type(torch.FloatTensor)
train_y_tensor = torch.from_numpy(np.array(training_labels)).long()
#validation_x_tensor = torch.from_numpy(np.array(validation_data)).type(torch.FloatTensor)
validation_y_tensor = torch.from_numpy(np.array(validation_labels)).long()

['this allows the possibility of double booking for the same date and time after the first.'
 'my sister has one also and she loves it.'
 "the one big drawback of the mp player is that the buttons on the phone's front cover that let you pause and skip songs lock out after a few seconds."
 'the cutouts and buttons are placed perfectly.'
 'this is definitely a must have if your state does not allow cell phone usage while driving.'
 'these are fabulous!' 'nice sound.'
 "i can't use this case because the smell is disgusting."
 'i really like this product over the motorola because it is allot clearer on the ear piece and the mic.'
 'fast service.' 'i found this product to be waaay too big.'
 "it plays louder than any other speaker of this size; the price is so low that most would think the quality is lacking, however, it's not."
 'no buyers remorse on this one!.'
 'i had to go to a store and bought a new nokia phone which is working great.'
 'poor quality and service.'
 'the worst piece of 

In [89]:
class ConcatDataset(torch.utils.data.Dataset):
    def __init__(self, datasetA, datasetB):
        self.datasetA = datasetA
        self.datasetB = datasetB

    def __getitem__(self, i):
        inp = self.datasetA[i]
        tokens = tokenizer(inp)
        embed = torch.tensor(vocab(tokens), dtype=torch.long)
        label = F.one_hot(self.datasetB[i],num_classes=2)
        return embed,label

    def __len__(self):
        return min(len(self.datasetA),len(self.datasetB))

In [95]:
def collate_fn_padd(batch):
    '''
    Padds batch of variable length

    note: it converts things ToTensor manually here since the ToTensor transform
    assume it takes in images rather than arbitrary tensors.
    '''
    ## get sequence lengths
    lengths = torch.tensor([ t.shape[0] for t in batch ]).to(device)
    ## padd
    batch = [ torch.Tensor(t).to(device) for t in batch ]
    batch = torch.nn.utils.rnn.pad_sequence(batch)
    ## compute mask
    mask = (batch != 0).to(device)
    return batch, lengths, mask


train_ds = ConcatDataset(training_data,train_y_tensor)
val_ds = ConcatDataset(validation_data,validation_y_tensor)
print(len(training_data),len(train_y_tensor))
train_loader = DataLoader(train_ds,batch_size=1)
val_loader = DataLoader(val_ds,batch_size=1)

900 900


In [96]:
it = iter(train_loader)

In [97]:
next(it)

[tensor([[  8, 602,   2,   0,  15,   0,   0,  16,   2, 162, 746,   4,  89,  63,
            2, 120,   1]]),
 tensor([[1, 0]])]

In [114]:
def train_model(model, criterion, optimizer, train_loader, val_loader, num_epochs):
    best_val_loss = float('inf')
    best_model = model

    for epoch in range(num_epochs):
        print(f"Starting epoch {epoch+1} of {num_epochs}")
        model.train()
        train_loss = 0.0
        for batch_nr, (inputs, labels) in enumerate(train_loader):
            inputs = inputs.to(device)
            labels = labels.to(device).to(torch.float)
            
            optimizer.zero_grad()
            outputs = model(inputs)
            #print(inputs)
            #print(outputs.unsqueeze(0))
            #print(labels)
            loss = criterion(outputs.unsqueeze(0), labels)
            loss = loss.to(device)

            if (batch_nr%80 == 0):
                print(f"Processing batch number {batch_nr+1} of {len(train_loader)}")
                print(outputs)
                print(labels)
                print("current loss",loss.item())
                
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * inputs.size(0)
        
        train_loss /= len(train_loader.dataset)
    
    # Validation
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device).to(torch.float)
            outputs = model(inputs)

            loss = criterion(outputs.unsqueeze(0), labels)
            val_loss += loss.item() * inputs.size(0)
    
    val_loss /= len(val_loader.dataset)
    
    if (epoch + 1) % 10 == 0:
        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')
    
    # Save the best model
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = model
    return best_model

In [115]:
LEARNING_RATE = 0.0001
EPOCHS = 30

model2 = FFF(len(vocab),model)

print(device)
criterion = torch.nn.CrossEntropyLoss().to(device)
# optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

optimizer = torch.optim.Adam(model2.parameters(), lr=LEARNING_RATE)

# Train the model
trained_model = train_model(model2, criterion, optimizer, train_loader, val_loader, EPOCHS)

cpu
Starting epoch 1 of 30
Processing batch number 1 of 900
tensor([0.3510, 0.6490], grad_fn=<SoftmaxBackward0>)
tensor([[1., 0.]])
current loss 0.8532605171203613


  return self._call_impl(*args, **kwargs)


Processing batch number 81 of 900
tensor([0.2835, 0.7165], grad_fn=<SoftmaxBackward0>)
tensor([[1., 0.]])
current loss 0.9328535795211792
Processing batch number 161 of 900
tensor([0.5033, 0.4967], grad_fn=<SoftmaxBackward0>)
tensor([[1., 0.]])
current loss 0.6898790597915649
Processing batch number 241 of 900
tensor([0.6800, 0.3200], grad_fn=<SoftmaxBackward0>)
tensor([[0., 1.]])
current loss 0.889215350151062
Processing batch number 321 of 900
tensor([0.3645, 0.6355], grad_fn=<SoftmaxBackward0>)
tensor([[0., 1.]])
current loss 0.5668364763259888
Processing batch number 401 of 900
tensor([0.5506, 0.4494], grad_fn=<SoftmaxBackward0>)
tensor([[0., 1.]])
current loss 0.7450413703918457
Processing batch number 481 of 900
tensor([0.8466, 0.1534], grad_fn=<SoftmaxBackward0>)
tensor([[0., 1.]])
current loss 1.0986812114715576
Processing batch number 561 of 900
tensor([0.1867, 0.8133], grad_fn=<SoftmaxBackward0>)
tensor([[0., 1.]])
current loss 0.4281196892261505
Processing batch number 641 o

KeyboardInterrupt: 

In [118]:
while True:
    inp = input("Leave a review:")
    if inp=="exit":
        break
    tokens = tokenizer(inp)
    embed = torch.tensor(vocab(tokens), dtype=torch.long)
    pred = model2(embed).tolist()
    print(pred)
    if(pred[0]>0.5):
        print("You seem to dislike this thing")
    else:
        print("You seem to like this thing")

[9.228735831090518e-12, 1.0]
You seem to like this thing
[0.9993926286697388, 0.0006073883851058781]
You seem to dislike this thing
[nan, nan]
You seem to like this thing
[1.8667834638108616e-06, 0.9999980926513672]
You seem to like this thing
[nan, nan]
You seem to like this thing
[0.0009728256263770163, 0.9990271329879761]
You seem to like this thing
[nan, nan]
You seem to like this thing
[1.0, 1.9274128959523296e-09]
You seem to dislike this thing
[nan, nan]
You seem to like this thing
[1.0985942109362559e-13, 1.0]
You seem to like this thing
