In [13]:
!pip install torchbearer
!pip install progress



In [0]:
import torch.nn as nn
import torch.nn.functional as F
import torch
from torch.utils.data import TensorDataset, random_split, DataLoader
import numpy as np
import time
import datetime
from torch import nn
import sys
import os
from progress.bar import IncrementalBar
from tqdm import tqdm
import torchtext.vocab
from torchtext import data
import pandas as pd

In [0]:
from torchtext.data import Field, Dataset, Example
import pandas as pd

class DataFrameDataset(Dataset):
    """Class for using pandas DataFrames as a datasource"""

    def __init__(self, examples, fields, filter_pred=None):
        """
        Create a dataset from a pandas dataframe of examples and Fields
        Arguments:
            examples pd.DataFrame: DataFrame of examples
            fields {str: Field}: The Fields to use in this tuple. The
                string is a field name, and the Field is the associated field.
            filter_pred (callable or None): use only exanples for which
                filter_pred(example) is true, or use all examples if None.
                Default is None
        """
        self.examples = examples.apply(SeriesExample.fromSeries, args=(fields,), axis=1).tolist()
        if filter_pred is not None:
            self.examples = filter(filter_pred, self.examples)
        self.fields = dict(fields)
        # Unpack field tuples
        for n, f in list(self.fields.items()):
            if isinstance(n, tuple):
                self.fields.update(zip(n, f))
                del self.fields[n]


class SeriesExample(Example):
    """Class to convert a pandas Series to an Example"""

    @classmethod
    def fromSeries(cls, data, fields):
        return cls.fromdict(data.to_dict(), fields)

    @classmethod
    def fromdict(cls, data, fields):
        ex = cls()

        for key, field in fields.items():
            if key not in data:
                raise ValueError("Specified key {} was not found in "
                                 "the input data".format(key))
            if field is not None:
                setattr(ex, key, field.preprocess(data[key]))
            else:
                setattr(ex, key, data[key])


        return ex

def three_class_problem(df):
  
  df = df[df['overall'] != 2]
  df = df[df['overall'] != 4]
  df.loc[df['overall'] == 1, 'overall'] = 0
  df.loc[df['overall'] == 3, 'overall'] = 1
  df.loc[df['overall'] == 5, 'overall'] = 2

  return df



# def create_iterator(train_data, valid_data, test_data, batch_size, device):
#     #  BucketIterator : Defines an iterator that batches examples of similar lengths together to minimize the amount of padding needed.
#     # by setting sort_within_batch = True.
#     train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits((train_data, valid_data, test_data),
#         batch_size = batch_size,
#         sort_key = lambda x: len(x.reviewText), # Sort the batches by text length size
#         sort_within_batch = True,
#         device = device)
#     return train_iterator, valid_iterator, test_iterator

In [16]:
from google.colab import drive
drive.mount('/content/drive')
root_path = "/content/drive/My Drive/notebooks"
# For Google colab only

df = pd.read_csv(f"{root_path}/new_clean_sm_100000.csv")
df = df[df['reviewText'].notna()]
df = df[~df['reviewText'].str.contains(".jpg|.png|.jpeg|.tiff|.gif|.bmp|.heif", regex=True, na=False)]
df = three_class_problem(df)
#df["overall"] = df["overall"].apply(lambda x: x - 1)
train_df, validate_df, test_df = np.split(df.sample(frac=1), [int(.6*len(df)), int(.8*len(df))])




Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
class MLP(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size2, hidden_size3, hidden_size4, output_dim, dropout, max_document_length):
        super().__init__()

        # embedding and convolution layers
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.fc1 = nn.Linear(embed_size*max_document_length, hidden_size2)  # dense layer
        self.fc2 = nn.Linear(hidden_size2, hidden_size3)  # dense layer
        self.fc3 = nn.Linear(hidden_size3, hidden_size4)  # dense layer
        self.fc4 = nn.Linear(hidden_size4, output_dim)  # dense layer

    def forward(self, text):
        # text shape = (batch_size, num_sequences)
        embedded = self.embedding(text)
        # embedded = [batch size, sent_len, emb dim]

        x = embedded.view(embedded.shape[0], -1)  # x = Flatten()(x)
        #embedded = embedded.unsqueeze(1) # fc gets 4 dimension

        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.relu(self.fc3(x))
        x = self.dropout(x)
        preds = self.fc4(x)
        #preds = F.softmax(preds, 1)
        #labels = torch.max(preds, 1)
        #print(f"preds is {preds}")
        return preds

In [0]:
#df, num_classes = three_class_problem(df)
max_document_length = 100  # each sentence has until 100 words
max_size = 5000 # maximum vocabulary size

Text = data.Field(tokenize='spacy', batch_first=True, include_lengths=True, fix_length=max_document_length) # fix_length - make the sentences padded in the same lengths for all the batches
Label = data.Field(sequential=False, use_vocab=False, pad_token=None, unk_token=None)
fields = { 'overall' : Label, 'reviewText' : Text }
train_ds = DataFrameDataset(train_df, fields)
test_ds = DataFrameDataset(test_df, fields)
valid_ds = DataFrameDataset(validate_df, fields)

Text.build_vocab(train_ds, max_size=max_size)
Label.build_vocab(train_ds)
vocab_size = len(Text.vocab)

In [25]:
def run_train(epochs, model, train_iterator, valid_iterator, optimizer, criterion, model_type, device):
    best_valid_loss = float('inf')
    model = model.to(device)

    for epoch in range(epochs):

        # train the model
        train_loss, train_acc = train(model, train_iterator, optimizer, criterion)

        # evaluate the model
        valid_loss, valid_acc, valid_f1 = evaluate(model, valid_iterator, criterion)


        # save the best model
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            #torch.save(model.state_dict(), 'saved_weights'+'_'+model_type+'.pt')

        print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc * 100:.2f}%')
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc * 100:.2f}%')
        print(f'\t F1 score is {valid_f1}')



def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    for batch in tqdm(iterator):
        optimizer.zero_grad()
        # retrieve text and no. of words

        text, text_lengths = batch.reviewText
        text, text_lengths =  text.to(device), text_lengths.to(device)

        predictions = model(text)
        loss = criterion(predictions, batch.overall)

        acc = accuracy(predictions, batch.overall)

        # perform backpropagation
        loss.backward()

        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    epoch_f1 = 0

    model.eval()

    with torch.no_grad():
        for batch in iterator:
            text, text_lengths = batch.reviewText

            predictions = model(text).squeeze(1)

            loss = criterion(predictions, batch.overall)

            acc = accuracy(predictions, batch.overall)
            f1 = f1_loss(predictions.argmax(dim=1), batch.overall)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
            epoch_f1 += f1.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator), epoch_f1/len(iterator)


def accuracy(probs, target):
  winners = probs.argmax(dim=1)
  corrects = (winners == target)
  accuracy = corrects.sum().float() / float(target.size(0))
  return accuracy

100%|██████████| 3596/3596 [00:23<00:00, 154.20it/s]
  0%|          | 0/3596 [00:00<?, ?it/s]

	Train Loss: 1.042 | Train Acc: 42.44%
	 Val. Loss: 0.906 |  Val. Acc: 53.77%
	 F1 score is 1.3534559601788922


100%|██████████| 3596/3596 [00:22<00:00, 161.60it/s]
  0%|          | 0/3596 [00:00<?, ?it/s]

	Train Loss: 0.764 | Train Acc: 64.17%
	 Val. Loss: 0.751 |  Val. Acc: 66.17%
	 F1 score is 1.371495495049331


100%|██████████| 3596/3596 [00:22<00:00, 160.38it/s]
  0%|          | 0/3596 [00:00<?, ?it/s]

	Train Loss: 0.540 | Train Acc: 76.96%
	 Val. Loss: 0.737 |  Val. Acc: 69.83%
	 F1 score is 1.4220948796952337


100%|██████████| 3596/3596 [00:22<00:00, 158.48it/s]
  0%|          | 0/3596 [00:00<?, ?it/s]

	Train Loss: 0.378 | Train Acc: 84.15%
	 Val. Loss: 0.949 |  Val. Acc: 69.64%
	 F1 score is 1.4372555155670572


100%|██████████| 3596/3596 [00:22<00:00, 159.33it/s]
  0%|          | 0/3596 [00:00<?, ?it/s]

	Train Loss: 0.332 | Train Acc: 86.26%
	 Val. Loss: 1.061 |  Val. Acc: 69.95%
	 F1 score is 1.4407364873711122


100%|██████████| 3596/3596 [00:22<00:00, 158.93it/s]
  0%|          | 0/3596 [00:00<?, ?it/s]

	Train Loss: 0.303 | Train Acc: 87.60%
	 Val. Loss: 1.138 |  Val. Acc: 71.05%
	 F1 score is 1.4393183846986721


100%|██████████| 3596/3596 [00:22<00:00, 159.02it/s]
  0%|          | 0/3596 [00:00<?, ?it/s]

	Train Loss: 0.281 | Train Acc: 88.50%
	 Val. Loss: 1.204 |  Val. Acc: 70.61%
	 F1 score is 1.4514214821265476


100%|██████████| 3596/3596 [00:22<00:00, 160.11it/s]


	Train Loss: 0.262 | Train Acc: 89.34%
	 Val. Loss: 1.306 |  Val. Acc: 70.03%
	 F1 score is 1.4607383524804836


In [0]:
def f1_loss(y_true:torch.Tensor, y_pred:torch.Tensor, is_training=False) -> torch.Tensor:
    '''Calculate F1 score. Can work with gpu tensors
    
    The original implmentation is written by Michal Haltuf on Kaggle.
    
    Returns
    -------
    torch.Tensor
        `ndim` == 1. 0 <= val <= 1
    
    Reference
    ---------
    - https://www.kaggle.com/rejpalcz/best-loss-function-for-f1-score-metric
    - https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html#sklearn.metrics.f1_score
    - https://discuss.pytorch.org/t/calculating-precision-recall-and-f1-score-in-case-of-multi-label-classification/28265/6
    
    '''
    assert y_true.ndim == 1
    assert y_pred.ndim == 1 or y_pred.ndim == 2
    
    if y_pred.ndim == 2:
        y_pred = y_pred.argmax(dim=1)
        
    
    tp = (y_true * y_pred).sum().to(torch.float32)
    tn = ((1 - y_true) * (1 - y_pred)).sum().to(torch.float32)
    fp = ((1 - y_true) * y_pred).sum().to(torch.float32)
    fn = (y_true * (1 - y_pred)).sum().to(torch.float32)
    
    epsilon = 1e-7
    
    precision = tp / (tp + fp + epsilon)
    recall = tp / (tp + fn + epsilon)
    
    f1 = 2* (precision*recall) / (precision + recall + epsilon)
    f1.requires_grad = is_training
    return f1

In [0]:
import torchbearer
from torchbearer import Trial
from torch import optim

hidden_size1 = 256
hidden_size2 = 128
hidden_size3 = 64
batch_size = 50
dropout_keep_prob = 0.5
embedding_size = 300
to_train = True
device = "cuda:0" if torch.cuda.is_available() else "cpu"

class MyIter:
    def __init__(self, it):
        self.it = it
    def __iter__(self):
        for batch in self.it:
            yield (batch.reviewText, batch.overall.unsqueeze(1))
    def __len__(self):
        return len(self.it)

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_ds, valid_ds, test_ds),
    batch_size=batch_size,
    device=device,
    sort_key=lambda x: len(x.reviewText),
    sort_within_batch=True)

mlp_model = MLP(vocab_size, embedding_size, hidden_size1, hidden_size2, hidden_size3,  3, dropout_keep_prob, max_document_length)
#train_iterator, valid_iterator, test_iterator = create_iterator(train_ds, valid_ds, test_ds, batch_size, device)


# define the loss function and the optimiser
loss_function = nn.CrossEntropyLoss()
optimiser = optim.Adam(mlp_model.parameters(), lr=1e-4)
run_train(8, mlp_model, train_iterator, valid_iterator, optimiser, loss_function, "MLP", device)
# torchbearer_trial = Trial(mlp_model, optimiser, loss_function, metrics=['acc', 'loss']).to(device)
# torchbearer_trial.with_generators(train_generator=MyIter(train_iterator), val_generator=MyIter(valid_iterator), test_generator=MyIter(test_iterator))
# torchbearer_trial.run(epochs=5)
# torchbearer_trial.predict()

