In [92]:
import torch
from torch import nn, Tensor
import torch.optim as optim
from torch.nn import TransformerEncoder, TransformerEncoderLayer
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
from matplotlib import pyplot
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk import word_tokenize
import nltk
from data_loading_code import preprocess_pandas
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, classification_report
import math


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device is", device)

device is cpu


In [93]:
#nltk.download('punkt')
#nltk.download('stopwords')
# get data, pre-process and split
data = pd.read_csv("amazon_cells_labelled.txt", delimiter='\t', header=None)
data.columns = ['Sentence', 'Class']
data['index'] = data.index                                          # add new column index
columns = ['index', 'Class', 'Sentence']
data = preprocess_pandas(data, columns)                             # pre-process
training_data, validation_data, training_labels, validation_labels = train_test_split( # split the data into training, validation, and test splits
    data['Sentence'].values.astype('U'),
    data['Class'].values.astype('int32'),
    test_size=0.10,
    random_state=0,
    shuffle=True
)

print(training_data)
print(type(training_data[0]))

# vectorize data using TFIDF and transform for PyTorch for scalability
word_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1,2), max_features=50000, max_df=0.5, use_idf=True, norm='l2')
training_data = word_vectorizer.fit_transform(training_data)        # transform texts to sparse matrix
training_data = training_data.todense()                             # convert to dense matrix for Pytorch
vocab_size = len(word_vectorizer.vocabulary_)
validation_data = word_vectorizer.transform(validation_data)
validation_data = validation_data.todense()

train_x_tensor = torch.from_numpy(np.array(training_data)).type(torch.LongTensor)
train_y_tensor = torch.from_numpy(np.array(training_labels)).long()
validation_x_tensor = torch.from_numpy(np.array(validation_data)).type(torch.LongTensor)
validation_y_tensor = torch.from_numpy(np.array(validation_labels)).long()

['this allows the possibility of double booking for the same date and time after the first.'
 'my sister has one also and she loves it.'
 "the one big drawback of the mp player is that the buttons on the phone's front cover that let you pause and skip songs lock out after a few seconds."
 'the cutouts and buttons are placed perfectly.'
 'this is definitely a must have if your state does not allow cell phone usage while driving.'
 'these are fabulous!' 'nice sound.'
 "i can't use this case because the smell is disgusting."
 'i really like this product over the motorola because it is allot clearer on the ear piece and the mic.'
 'fast service.' 'i found this product to be waaay too big.'
 "it plays louder than any other speaker of this size; the price is so low that most would think the quality is lacking, however, it's not."
 'no buyers remorse on this one!.'
 'i had to go to a store and bought a new nokia phone which is working great.'
 'poor quality and service.'
 'the worst piece of 

In [94]:
print(train_x_tensor.shape)

torch.Size([900, 7277])


In [95]:
train_x_tensor[1].shape

torch.Size([7277])

In [96]:
print(data.iloc(0))

<pandas.core.indexing._iLocIndexer object at 0x0000023072333E80>


In [97]:
dl = DataLoader(train_y_tensor, batch_size=5,num_workers=2)
it = iter(dl)
print(next(it))

tensor([0, 1, 0, 1, 1])


In [98]:
class ConcatDataset(torch.utils.data.Dataset):
    def __init__(self, datasetA, datasetB):
        self.datasetA = datasetA
        self.datasetB = datasetB

    def __getitem__(self, i):
        inp = self.datasetA[i]
        label = F.one_hot(self.datasetB[i],num_classes=2)
        return inp,label.type(torch.LongTensor)

    def __len__(self):
        return min(len(self.datasetA),len(self.datasetB))

In [99]:
train_ds = ConcatDataset(train_x_tensor,train_y_tensor)
val_ds = ConcatDataset(validation_x_tensor,validation_y_tensor)
train_loader = DataLoader(train_ds,batch_size=5)
val_loader = DataLoader(val_ds,batch_size=5)

In [105]:
class TransformerModel(nn.Module):

    def __init__(self, ntoken: int, d_model: int, nhead: int, d_hid: int,
                 nlayers: int, dropout: float = 0.5):
        super().__init__()
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        encoder_layers = TransformerEncoderLayer(d_model, nhead, d_hid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.embedding = nn.Embedding(ntoken, d_model)
        self.d_model = d_model
        self.Softmax = nn.Softmax(dim=1)
        self.linear1 = nn.Linear(d_model*vocab_size, 100)
        self.linear2 = nn.Linear(100, 10)
        self.linear = nn.Linear(10, 2)
        

        self.init_weights()

    def init_weights(self) -> None:
        initrange = 0.1
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.linear.bias.data.zero_()
        self.linear.weight.data.uniform_(-initrange, initrange)

    def forward(self, src: Tensor, src_mask: Tensor = None) -> Tensor:
        """
        Arguments:
            src: Tensor, shape ``[seq_len, batch_size]``
            src_mask: Tensor, shape ``[seq_len, seq_len]``

        Returns:
            output Tensor of shape ``[seq_len, batch_size, ntoken]``
        """
        src = self.embedding(src) * math.sqrt(self.d_model)
        src = self.pos_encoder(src)
        if src_mask is None:
            """Generate a square causal mask for the sequence. The masked positions are filled with float('-inf').
            Unmasked positions are filled with float(0.0).
            """
            src_mask = nn.Transformer.generate_square_subsequent_mask(len(src)).to(device)
        output = self.transformer_encoder(src, src_mask)
        output = torch.flatten(output, 1)
        output = self.linear1(output)
        output = self.linear2(output)
        output = self.linear(output)
        output = self.Softmax(output)
        return output

class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Arguments:
            x: Tensor, shape ``[seq_len, batch_size, embedding_dim]``
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [101]:
def train_model(model, criterion, optimizer, train_loader, val_loader, num_epochs):
    best_val_loss = float('inf')
    best_model = model

    for epoch in range(num_epochs):
        print(f"Starting epoch {epoch+1} of {num_epochs}")
        model.train()
        train_loss = 0.0
        for batch_nr, (inputs, labels) in enumerate(train_loader):
            inputs = inputs.to(device).type(torch.LongTensor)
            labels = labels.to(device).to(torch.float)
            
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss = loss.to(device)

            if (batch_nr%5 == 0):
                
                print(labels)
                print(outputs)
                print(f"Processing batch number {batch_nr+1} of {len(train_loader)}")
                print("current loss",loss.item())
                
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * inputs.size(0)
        
        train_loss /= len(train_loader.dataset)
    
    # Validation
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs = inputs.to(device).type(torch.LongTensor)
            labels = labels.to(device).to(torch.float)
            outputs = model(inputs)
        
            loss = criterion(outputs, labels)
            val_loss += loss.item() * inputs.size(0)
    
    val_loss /= len(val_loader.dataset)
    
    if (epoch + 1) % 10 == 0:
        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')
    
    # Save the best model
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = model
    return best_model

In [106]:
LEARNING_RATE = 0.0001
EPOCHS = 3

print(device)
ntokens = vocab_size  # size of vocabulary
emsize = 100  # embedding dimension
d_hid = 200  # dimension of the feedforward network model in ``nn.TransformerEncoder``
nlayers = 2  # number of ``nn.TransformerEncoderLayer`` in ``nn.TransformerEncoder``
nhead = 2  # number of heads in ``nn.MultiheadAttention``
dropout = 0.05  # dropout probability
model = TransformerModel(ntokens, emsize, nhead, d_hid, nlayers, dropout).to(device)
criterion = torch.nn.CrossEntropyLoss().to(device)
# optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Train the model
trained_model = train_model(model, criterion, optimizer, train_loader, val_loader, EPOCHS)

cpu




AttributeError: 'TransformerModel' object has no attribute 'linear'

In [None]:
word_vectorizer.vocabulary_

{'this': 6144,
 'allows': 149,
 'the': 5816,
 'possibility': 4720,
 'of': 4137,
 'double': 1660,
 'booking': 815,
 'for': 2146,
 'same': 5161,
 'date': 1456,
 'and': 229,
 'time': 6241,
 'after': 81,
 'first': 2078,
 'this allows': 6146,
 'allows the': 151,
 'the possibility': 5982,
 'possibility of': 4721,
 'of double': 4148,
 'double booking': 1661,
 'booking for': 816,
 'for the': 2196,
 'the same': 6005,
 'same date': 5163,
 'date and': 1457,
 'and time': 368,
 'time after': 6242,
 'after the': 95,
 'the first': 5890,
 'my': 3829,
 'sister': 5367,
 'has': 2559,
 'one': 4256,
 'also': 163,
 'she': 5300,
 'loves': 3574,
 'it': 3094,
 'my sister': 3898,
 'sister has': 5368,
 'has one': 2573,
 'one also': 4257,
 'also and': 164,
 'and she': 350,
 'she loves': 5304,
 'loves it': 3575,
 'big': 771,
 'drawback': 1679,
 'mp': 3801,
 'player': 4659,
 'is': 2975,
 'that': 5757,
 'buttons': 929,
 'on': 4220,
 'phone': 4527,
 'front': 2279,
 'cover': 1408,
 'let': 3405,
 'you': 7222,
 'pause':

In [None]:
word_tokenize("Hello my name is grape")

['Hello', 'my', 'name', 'is', 'grape']

In [None]:
while True:
    inp = input("Leave a review:")
    if inp=="exit":
        break
    tdata = word_vectorizer.transform([" ".join(word_tokenize(inp))])
    tdata = torch.Tensor(tdata.todense())
    tdata = torch.from_numpy(np.array(tdata)).type(torch.FloatTensor)
    print(tdata.shape)
    pred = model(tdata).tolist()
    if(pred[0][0]>0.5):
        print("You seem to dislike this thing")
    else:
        print("You seem to like this thing")

torch.Size([1, 7277])
You seem to dislike this thing
torch.Size([1, 7277])
You seem to like this thing
