In [1]:
import torch
from torch import nn, Tensor
from torch.nn import TransformerEncoder, TransformerEncoderLayer
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, dataset
import numpy as np
from matplotlib import pyplot
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk import word_tokenize
import nltk
from data_loading_code import preprocess_pandas
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, classification_report

import math
import os
from tempfile import TemporaryDirectory
from typing import Tuple

import time


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device is", device)

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


device is cpu


In [2]:
class TransformerModel(nn.Module):

    def __init__(self, ntoken: int, d_model: int, nhead: int, d_hid: int,
                 nlayers: int, dropout: float = 0.5):
        super().__init__()
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        encoder_layers = TransformerEncoderLayer(d_model, nhead, d_hid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.embedding = nn.Embedding(ntoken, d_model)
        self.d_model = d_model
        self.linear = nn.Linear(d_model, ntoken)

        self.init_weights()

    def init_weights(self) -> None:
        initrange = 0.1
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.linear.bias.data.zero_()
        self.linear.weight.data.uniform_(-initrange, initrange)

    def forward(self, src: Tensor, src_mask: Tensor = None) -> Tensor:
        """
        Arguments:
            src: Tensor, shape ``[seq_len, batch_size]``
            src_mask: Tensor, shape ``[seq_len, seq_len]``

        Returns:
            output Tensor of shape ``[seq_len, batch_size, ntoken]``
        """
        src = self.embedding(src) * math.sqrt(self.d_model)
        src = self.pos_encoder(src)
        if src_mask is None:
            """Generate a square causal mask for the sequence. The masked positions are filled with float('-inf').
            Unmasked positions are filled with float(0.0).
            """
            src_mask = nn.Transformer.generate_square_subsequent_mask(len(src)).to(device)
        output = self.transformer_encoder(src, src_mask)
        output = self.linear(output)
        return output

class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Arguments:
            x: Tensor, shape ``[seq_len, batch_size, embedding_dim]``
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [3]:
from torchtext.datasets import WikiText2
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator       
import copy     # pre-process

data = pd.read_csv("amazon_cells_labelled.txt", delimiter='\t', header=None)
data.columns = ['Sentence', 'Class']
data['index'] = data.index                                          # add new column index
columns = ['index', 'Class', 'Sentence']
data = preprocess_pandas(data, columns)   

train_iter, val_iter, test_iter = \
              np.split(data.sample(frac=1, random_state=42), 
                       [int(.6*len(data)), int(.8*len(data))])

train_iter2 = copy.deepcopy(train_iter)

tokenizer = get_tokenizer('basic_english')
vocab = build_vocab_from_iterator(map(tokenizer, train_iter2["Sentence"]), specials=['<unk>'])
vocab.set_default_index(vocab['<unk>'])

#nltk.download('punkt')
#nltk.download('stopwords')
# get data, pre-process and split              

def data_process(raw_text_iter: dataset.IterableDataset) -> Tensor:
    """Converts raw text into a flat Tensor."""
    data2 = [torch.tensor(vocab(tokenizer(item)), dtype=torch.long) for item in raw_text_iter]
    print(data2)
    return torch.cat(tuple(filter(lambda t: t.numel() > 0, data2)))


train_data = data_process(train_iter["Sentence"])
val_data = data_process(val_iter["Sentence"])
test_data = data_process(test_iter["Sentence"])


def batchify(data: Tensor, bsz: int) -> Tensor:
    """Divides the data into ``bsz`` separate sequences, removing extra elements
    that wouldn't cleanly fit.

    Arguments:
        data: Tensor, shape ``[N]``
        bsz: int, batch size

    Returns:
        Tensor of shape ``[N // bsz, bsz]``
    """
    seq_len = data.size(0) // bsz
    data = data[:seq_len * bsz]
    data = data.view(bsz, seq_len).t().contiguous()
    return data.to(device)

batch_size = 20
eval_batch_size = 10
train_data = batchify(train_data, batch_size)  # shape ``[seq_len, batch_size]``
val_data = batchify(val_data, eval_batch_size)
test_data = batchify(test_data, eval_batch_size)

#training_data, validation_data, training_labels, validation_labels = train_test_split( # split the data into training, validation, and test splits
#    data['Sentence'].values.astype('U'),
#    data['Class'].values.astype('int32'),
#    test_size=0.10,
#    random_state=0,
#    shuffle=True
#)
#
## vectorize data using TFIDF and transform for PyTorch for scalability
#word_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1,2), max_features=50000, max_df=0.5, use_idf=True, norm='l2')
#training_data = word_vectorizer.fit_transform(training_data)        # transform texts to sparse matrix
#training_data = training_data.todense()                             # convert to dense matrix for Pytorch
#vocab_size = len(word_vectorizer.vocabulary_)
#validation_data = word_vectorizer.transform(validation_data)
#validation_data = validation_data.todense()
#
#train_x_tensor = torch.from_numpy(np.array(training_data)).type(torch.FloatTensor)
#train_y_tensor = torch.from_numpy(np.array(training_labels)).long()
#validation_x_tensor = torch.from_numpy(np.array(validation_data)).type(torch.FloatTensor)
#validation_y_tensor = torch.from_numpy(np.array(validation_labels)).long()
#
#class TransfromerDataset(torch.utils.data.Dataset):
#    def __init__(self, datasetA, bptt):
#        self.source = datasetA
#        self.bptt = bptt
#
#    def __getitem__(self, i):
#        seq_len = min(self.bptt, len(self.source) - 1 - i)
#        data = self.source[i:i+seq_len]
#        target = self.source[i+1:i+1+seq_len].reshape(-1)
#        return data, target
#
#    def __len__(self):
#        return min(len(self.datasetA))
#    
#train_ds = ConcatDataset(train_x_tensor,train_y_tensor)
#val_ds = ConcatDataset(validation_x_tensor,validation_y_tensor)
#train_loader = DataLoader(train_ds,batch_size=5)
#val_loader = DataLoader(val_ds,batch_size=5)

[tensor([1280,  194,    9,  167,   16,  243,    2,  551,    3,  254,   16,   10,
          26,   68,   12]), tensor([   2, 1303,    7,  566,  435,    4,  182,    5,  106, 1295,    6,  419,
           1]), tensor([  3,  61, 140,  48,   8,  29,   1]), tensor([   8,   59,   18,  121,    2,  333, 1315,  412,    4,    5,  170,    2,
         120,   89,    3,  187,    9,  126,    5,   24,    1]), tensor([  3, 140,  48,   8, 117,   9, 816,  12]), tensor([   5,   30,  119,   21,   14,  124,  674,    6,    4,   10,  209,   51,
          47,    2, 1365,   31,  138,    1]), tensor([  73,   11,   25,  123,    2,  162,  327,   22,    3,  138,    4, 1102,
          73,   11,   25,   72,    8,   13,    1]), tensor([   3,   92,    2,  606,   49,   16,   14,   94,  333, 1335,    4,    5,
          78,   60, 1346,   46, 1127,   14,  892,  257,   38,  155,    1]), tensor([147,  29,   6,   4, 130,  15,  93,  12]), tensor([ 23,  26, 542,  16,   2,  68,   1]), tensor([365,   5,  30,  62, 365,   5,   7, 104,

  return bound(*args, **kwds)


In [4]:
bptt = 35
def get_batch(source: Tensor, i: int) -> Tuple[Tensor, Tensor]:
    """
    Args:
        source: Tensor, shape ``[full_seq_len, batch_size]``
        i: int

    Returns:
        tuple (data, target), where data has shape ``[seq_len, batch_size]`` and
        target has shape ``[seq_len * batch_size]``
    """
    seq_len = min(bptt, len(source) - 1 - i)
    data = source[i:i+seq_len]
    target = source[i+1:i+1+seq_len].reshape(-1)
    return data, target


In [5]:
ntokens = len(vocab)  # size of vocabulary
emsize = 200  # embedding dimension
d_hid = 200  # dimension of the feedforward network model in ``nn.TransformerEncoder``
nlayers = 2  # number of ``nn.TransformerEncoderLayer`` in ``nn.TransformerEncoder``
nhead = 2  # number of heads in ``nn.MultiheadAttention``
dropout = 0.2  # dropout probability
model = TransformerModel(ntokens, emsize, nhead, d_hid, nlayers, dropout).to(device)



In [6]:
criterion = nn.CrossEntropyLoss()
lr = 5.0  # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

def train(model: nn.Module) -> None:
    model.train()  # turn on train mode
    total_loss = 0.
    log_interval = 2
    start_time = time.time()

    num_batches = len(train_data) // bptt
    for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
        data, targets = get_batch(train_data, i)
        output = model(data)
        output_flat = output.view(-1, ntokens)
        loss = criterion(output_flat, targets)

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        if batch % log_interval == 0 and batch > 0:
            lr = scheduler.get_last_lr()[0]
            ms_per_batch = (time.time() - start_time) * 1000 / log_interval
            cur_loss = total_loss / log_interval
            ppl = math.exp(cur_loss)
            print(f'| epoch {epoch:3d} | {batch:5d}/{num_batches:5d} batches | '
                  f'lr {lr:02.2f} | ms/batch {ms_per_batch:5.2f} | '
                  f'loss {cur_loss:5.2f} | ppl {ppl:8.2f}')
            total_loss = 0
            start_time = time.time()

def evaluate(model: nn.Module, eval_data: Tensor) -> float:
    model.eval()  # turn on evaluation mode
    total_loss = 0.
    with torch.no_grad():
        for i in range(0, eval_data.size(0) - 1, bptt):
            data, targets = get_batch(eval_data, i)
            seq_len = data.size(0)
            output = model(data)
            output_flat = output.view(-1, ntokens)
            total_loss += seq_len * criterion(output_flat, targets).item()
    return total_loss / (len(eval_data) - 1)

In [7]:
best_val_loss = float('inf')
epochs = 50

with TemporaryDirectory() as tempdir:
    best_model_params_path = os.path.join(tempdir, "best_model_params.pt")

    for epoch in range(1, epochs + 1):
        epoch_start_time = time.time()
        train(model)
        val_loss = evaluate(model, val_data)
        val_ppl = math.exp(val_loss)
        elapsed = time.time() - epoch_start_time
        print('-' * 89)
        print(f'| end of epoch {epoch:3d} | time: {elapsed:5.2f}s | '
            f'valid loss {val_loss:5.2f} | valid ppl {val_ppl:8.2f}')
        print('-' * 89)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), best_model_params_path)

        scheduler.step()
    model.load_state_dict(torch.load(best_model_params_path)) # load best model states

| epoch   1 |     2/   10 batches | lr 5.00 | ms/batch 109.16 | loss 11.71 | ppl 121418.68
| epoch   1 |     4/   10 batches | lr 5.00 | ms/batch 54.07 | loss  7.75 | ppl  2332.66
| epoch   1 |     6/   10 batches | lr 5.00 | ms/batch 49.29 | loss  8.72 | ppl  6125.85
| epoch   1 |     8/   10 batches | lr 5.00 | ms/batch 51.27 | loss  9.18 | ppl  9662.39
| epoch   1 |    10/   10 batches | lr 5.00 | ms/batch 40.40 | loss  7.01 | ppl  1107.95
-----------------------------------------------------------------------------------------
| end of epoch   1 | time:  0.68s | valid loss  7.89 | valid ppl  2659.11
-----------------------------------------------------------------------------------------
| epoch   2 |     2/   10 batches | lr 4.75 | ms/batch 81.27 | loss 10.54 | ppl 37852.62
| epoch   2 |     4/   10 batches | lr 4.75 | ms/batch 56.30 | loss 10.26 | ppl 28487.18
| epoch   2 |     6/   10 batches | lr 4.75 | ms/batch 62.92 | loss  7.04 | ppl  1136.87
| epoch   2 |     8/   10 batche

In [11]:
inp = "I like this phone"
tokens = tokenizer(inp)
embed = torch.tensor([vocab(tokens)], dtype=torch.long)
print(inp)
print(tokens)
print(embed)
output = model(embed)
output_flat = output.view(-1, ntokens)
print(output_flat.shape)
print(output_flat)


I like this phone
['i', 'like', 'this', 'phone']
tensor([[ 3, 56,  8, 13]])
torch.Size([4, 1370])
tensor([[ 5.1994e-02,  3.5826e+00,  2.0664e+00,  ..., -9.8716e-01,
         -1.2189e+00, -1.0230e+00],
        [-2.9615e-02,  6.0346e+00,  5.3082e+00,  ..., -2.1042e-01,
          4.9607e-01, -3.8053e-01],
        [-1.6299e+00,  5.5731e+00,  1.5503e+00,  ..., -1.3320e-01,
          4.0497e-01, -1.2075e+00],
        [-6.0003e-01,  7.6937e+00,  3.6829e+00,  ...,  6.0129e-03,
          7.4662e-01, -5.7282e-01]], grad_fn=<ViewBackward0>)


In [12]:
ma = torch.argmax(output_flat,dim=1)
print(ma)
print(ma[0])
print(vocab.lookup_tokens(ma.tolist()))

tensor([11,  5,  7,  4])
tensor(11)
["'", 'it', 'is', 'and']


In [96]:
data = pd.read_csv("amazon_cells_labelled.txt", delimiter='\t', header=None)
data.columns = ['Sentence', 'Class']
data['index'] = data.index                                          # add new column index
columns = ['index', 'Class', 'Sentence']
data = preprocess_pandas(data, columns)                             # pre-process
training_data, validation_data, training_labels, validation_labels = train_test_split( # split the data into training, validation, and test splits
    data['Sentence'].values.astype('U'),
    data['Class'].values.astype('int32'),
    test_size=0.10,
    random_state=0,
    shuffle=True
)

print(training_data)
print(type(training_data[0]))

# vectorize data using TFIDF and transform for PyTorch for scalability
word_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1,2), max_features=50000, max_df=0.5, use_idf=True, norm='l2')
training_data = word_vectorizer.fit_transform(training_data)        # transform texts to sparse matrix
training_data = training_data.todense()                             # convert to dense matrix for Pytorch
vocab_size = len(word_vectorizer.vocabulary_)
validation_data = word_vectorizer.transform(validation_data)
validation_data = validation_data.todense()

train_x_tensor = torch.from_numpy(np.array(training_data)).type(torch.FloatTensor)
train_y_tensor = torch.from_numpy(np.array(training_labels)).long()
validation_x_tensor = torch.from_numpy(np.array(validation_data)).type(torch.FloatTensor)
validation_y_tensor = torch.from_numpy(np.array(validation_labels)).long()

['this allows the possibility of double booking for the same date and time after the first.'
 'my sister has one also and she loves it.'
 "the one big drawback of the mp player is that the buttons on the phone's front cover that let you pause and skip songs lock out after a few seconds."
 'the cutouts and buttons are placed perfectly.'
 'this is definitely a must have if your state does not allow cell phone usage while driving.'
 'these are fabulous!' 'nice sound.'
 "i can't use this case because the smell is disgusting."
 'i really like this product over the motorola because it is allot clearer on the ear piece and the mic.'
 'fast service.' 'i found this product to be waaay too big.'
 "it plays louder than any other speaker of this size; the price is so low that most would think the quality is lacking, however, it's not."
 'no buyers remorse on this one!.'
 'i had to go to a store and bought a new nokia phone which is working great.'
 'poor quality and service.'
 'the worst piece of 

In [97]:
class ConcatDataset(torch.utils.data.Dataset):
    def __init__(self, datasetA, datasetB):
        self.datasetA = datasetA
        self.datasetB = datasetB

    def __getitem__(self, i):
        inp = self.datasetA[i]
        tokens = tokenizer(inp)
        embed = torch.tensor(vocab(tokens), dtype=torch.long)
        label = F.one_hot(self.datasetB[i],num_classes=2)
        output = model(embed)
        output_flat = output.view(-1, ntokens)
        return output_flat,label

    def __len__(self):
        return min(len(self.datasetA),len(self.datasetB))

In [98]:
train_ds = ConcatDataset(train_x_tensor,train_y_tensor)
val_ds = ConcatDataset(validation_x_tensor,validation_y_tensor)
train_loader = DataLoader(train_ds,batch_size=5)
val_loader = DataLoader(val_ds,batch_size=5)

In [99]:
class ANN(nn.Module):
    def __init__(self,num_classes):
        super().__init__()
        
        self.act = nn.LeakyReLU()

        # input: 7277        
        self.fc1 = nn.Linear(in_features=7277, out_features=1000) 
        self.fc2 = nn.Linear(in_features=1000, out_features=100)
        self.fc3 = nn.Linear(in_features=100, out_features=10)
        self.fc4 = nn.Linear(in_features=10, out_features=2)
        self.logSoftmax = nn.Softmax(dim=1)


    
    def forward(self, x):
        #FC Layer 1
        x = self.fc1(x)
        x = self.act(x)

        #FC Layer 2
        x = self.fc2(x)   
        x = self.act(x)    

        #FC Layer 3
        x = self.fc3(x)   
        x = self.act(x)    

        #FC Layer 4
        x = self.fc4(x)    
        
        #Softmax
        out = self.logSoftmax(x)

        return out

In [100]:
def train_model(model, criterion, optimizer, train_loader, val_loader, num_epochs):
    best_val_loss = float('inf')
    best_model = model

    for epoch in range(num_epochs):
        print(f"Starting epoch {epoch+1} of {num_epochs}")
        model.train()
        train_loss = 0.0
        for batch_nr, (inputs, labels) in enumerate(train_loader):
            inputs = inputs.to(device)
            labels = labels.to(device).to(torch.float)
            
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss = loss.to(device)

            if (batch_nr%20 == 0):
                print(f"Processing batch number {batch_nr+1} of {len(train_loader)}")
                print("current loss",loss.item())
                
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * inputs.size(0)
        
        train_loss /= len(train_loader.dataset)
    
    # Validation
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device).to(torch.float)
            outputs = model(inputs)
        
            loss = criterion(outputs, labels)
            val_loss += loss.item() * inputs.size(0)
    
    val_loss /= len(val_loader.dataset)
    
    if (epoch + 1) % 10 == 0:
        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')
    
    # Save the best model
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = model
    return best_model

In [101]:
LEARNING_RATE = 0.0001
EPOCHS = 10

print(device)
model2 = ANN(num_classes=2).to(device)
criterion = torch.nn.CrossEntropyLoss().to(device)
# optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

optimizer = torch.optim.Adam(model2.parameters(), lr=LEARNING_RATE)

# Train the model
trained_model = train_model(model2, criterion, optimizer, train_loader, val_loader, EPOCHS)

cpu
Starting epoch 1 of 10


AttributeError: 'Tensor' object has no attribute 'lower'

In [123]:
len(vocab)

1370