In [0]:
!pip install torchbearer
!pip install progress



In [0]:
import torch.nn as nn
import torch.nn.functional as F
import torch
from torch.utils.data import TensorDataset, random_split, DataLoader
import numpy as np
import time
import datetime
from torch import nn
import sys
import os
from tqdm import tqdm
import torchtext.vocab
from torchtext import data
import pandas as pd
from sklearn.metrics import f1_score

In [0]:
from torchtext.data import Field, Dataset, Example
import pandas as pd

class DataFrameDataset(Dataset):
    """A torchtext ``Dataset`` whose examples are built from the rows of a pandas DataFrame."""

    def __init__(self, examples, fields, filter_pred=None):
        """
        Create a dataset from a pandas dataframe of examples and Fields
        Arguments:
            examples pd.DataFrame: DataFrame of examples; every row becomes
                one example via ``SeriesExample.fromSeries``.
            fields {str: Field}: The Fields to use in this tuple. The
                string is a field name, and the Field is the associated field.
            filter_pred (callable or None): use only examples for which
                filter_pred(example) is true, or use all examples if None.
                Default is None
        """
        self.examples = examples.apply(SeriesExample.fromSeries, args=(fields,), axis=1).tolist()
        if filter_pred is not None:
            # filter() returns a lazy, single-pass iterator in Python 3: it has
            # no len(), cannot be indexed and is empty after one traversal.
            # Materialise it so the dataset stays a real, reusable list.
            self.examples = list(filter(filter_pred, self.examples))
        self.fields = dict(fields)
        # Unpack field tuples: a key that is a tuple of names maps to a tuple of
        # fields; register each (name, field) pair individually instead.
        for n, f in list(self.fields.items()):
            if isinstance(n, tuple):
                self.fields.update(zip(n, f))
                del self.fields[n]


class SeriesExample(Example):
    """An ``Example`` that can be built directly from a pandas Series (one DataFrame row)."""

    @classmethod
    def fromSeries(cls, data, fields):
        """Turn the Series into a plain dict and delegate to ``fromdict``."""
        row = data.to_dict()
        return cls.fromdict(row, fields)

    @classmethod
    def fromdict(cls, data, fields):
        """
        Build an example from a mapping of column name -> raw value.

        For every ``(name, field)`` pair in ``fields`` the raw value is stored
        on the example under ``name``; it is passed through
        ``field.preprocess`` first unless the field is ``None``.

        Raises:
            ValueError: if a name listed in ``fields`` is missing from ``data``.
        """
        example = cls()
        for name, field in fields.items():
            if name not in data:
                raise ValueError("Specified key {} was not found in "
                                 "the input data".format(name))
            raw_value = data[name]
            processed = raw_value if field is None else field.preprocess(raw_value)
            setattr(example, name, processed)
        return example

def three_class_problem(df):
    """
    Reduce the 1-5 star ``overall`` rating to a three-class label.

    Rows rated 2 or 4 are dropped; the remaining ratings are relabelled
    1 -> 0, 3 -> 1 and 5 -> 2. Any other value is left untouched.

    Arguments:
        df (pd.DataFrame): frame with an ``overall`` column.

    Returns:
        pd.DataFrame: a new, independent frame; the input is not modified.
    """
    # Work on an explicit copy: assigning through .loc on a filtered slice is
    # chained assignment and triggers pandas' SettingWithCopyWarning.
    df = df[~df['overall'].isin([2, 4])].copy()

    # The order matters only in that each target value is free when it is
    # written: 2 was removed above, and 1 has been vacated before 3 -> 1.
    df.loc[df['overall'] == 1, 'overall'] = 0
    df.loc[df['overall'] == 3, 'overall'] = 1
    df.loc[df['overall'] == 5, 'overall'] = 2

    return df



# def create_iterator(train_data, valid_data, test_data, batch_size, device):
#     #  BucketIterator : Defines an iterator that batches examples of similar lengths together to minimize the amount of padding needed.
#     # by setting sort_within_batch = True.
#     train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits((train_data, valid_data, test_data),
#         batch_size = batch_size,
#         sort_key = lambda x: len(x.reviewText), # Sort the batches by text length size
#         sort_within_batch = True,
#         device = device)
#     return train_iterator, valid_iterator, test_iterator

In [3]:
# from google.colab import drive
# drive.mount('/content/drive')
# root_path = "/content/drive/My Drive/notebooks"
# # For Google colab only

# Seed for the shuffle below so that the train/validation/test split is the
# same on every run of the notebook.
SPLIT_SEED = 42

# Load the reviews (path is relative to the notebook's working directory).
df = pd.read_csv("../new_clean_sm_100000.csv")
df = df[df['reviewText'].notna()]
# Drop reviews whose text contains an image file extension. The dot is escaped
# so that it matches a literal "." rather than any character (the unescaped
# pattern also matched e.g. "a jpg" or "xpng").
df = df[~df['reviewText'].str.contains(r"\.(?:jpg|png|jpeg|tiff|gif|bmp|heif)", regex=True, na=False)]
df = three_class_problem(df)

# Shuffle once, then cut 60% / 20% / 20% by position. Plain iloc slicing is
# used instead of np.split, which treats the DataFrame as an array and is
# warned about by recent pandas releases.
shuffled_df = df.sample(frac=1, random_state=SPLIT_SEED)
train_end = int(.6 * len(shuffled_df))
valid_end = int(.8 * len(shuffled_df))
train_df = shuffled_df.iloc[:train_end]
validate_df = shuffled_df.iloc[train_end:valid_end]
test_df = shuffled_df.iloc[valid_end:]




Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# credit to https://github.com/Shawn1993/cnn-text-classification-pytorch for the TextCNN model

class CNN_Text(nn.Module):
    """
    Convolutional text classifier: embedding -> parallel convolutions of
    several heights -> max-over-time pooling -> dropout -> linear layer.

    Arguments:
        embed_num (int): number of rows in the embedding table (vocabulary size).
        embed_dim (int): width of each embedding vector.
        class_num (int): number of output classes (size of the logit vector).
        kernel_num (int): output channels of every convolution.
        kernel_sizes (sequence of int): heights (in tokens) of the convolutions;
            one convolution is created per entry.
        dropout (float): dropout probability applied before the final linear
            layer. Defaults to 0.5, the value that used to be hard-coded.
    """

    def __init__(self, embed_num, embed_dim, class_num, kernel_num, kernel_sizes, dropout=0.5):
        super(CNN_Text, self).__init__()

        in_channels = 1  # the embedded sentence is treated as a one-channel image

        self.embed = nn.Embedding(embed_num, embed_dim)
        # ModuleList (not a plain list) so the convolutions are registered as
        # sub-modules and their parameters are seen by .parameters()/.to().
        self.convs1 = nn.ModuleList(
            [nn.Conv2d(in_channels, kernel_num, (k, embed_dim)) for k in kernel_sizes])
        self.dropout = nn.Dropout(dropout)
        self.fc1 = nn.Linear(len(kernel_sizes) * kernel_num, class_num)

    def conv_and_pool(self, x, conv):
        """Apply one convolution + ReLU, then max-pool over the whole sequence axis."""
        x = F.relu(conv(x)).squeeze(3)  # (N, Co, W)
        x = F.max_pool1d(x, x.size(2)).squeeze(2)  # (N, Co)
        return x

    def forward(self, x):
        """Map a batch of token-id sequences (N, W) to class logits (N, C)."""
        x = self.embed(x)  # (N, W, D)
        x = x.unsqueeze(1)  # (N, Ci, W, D)

        # One pooled feature vector per kernel size, concatenated channel-wise.
        pooled = [self.conv_and_pool(x, conv) for conv in self.convs1]  # [(N, Co), ...]*len(Ks)
        x = torch.cat(pooled, 1)  # (N, len(Ks)*Co)

        x = self.dropout(x)
        logit = self.fc1(x)  # (N, C)
        return logit

In [0]:
#df, num_classes = three_class_problem(df)
# Sequence length and vocabulary limits used by the text field below.
max_document_length = 256  # every review is brought to this many tokens
max_size = 5000  # maximum vocabulary size

# Review text: spaCy tokenisation, batch-first tensors, lengths returned
# alongside the token ids, and one fixed length shared by all batches.
Text = data.Field(
    tokenize='spacy',
    batch_first=True,
    include_lengths=True,
    fix_length=max_document_length,
)
# Rating label: a single, already numeric value, so no vocabulary is used.
Label = data.Field(sequential=False, use_vocab=False, pad_token=None, unk_token=None)
fields = {'overall': Label, 'reviewText': Text}

# Same construction order as before: train, test, validation.
train_ds, test_ds, valid_ds = (
    DataFrameDataset(frame, fields) for frame in (train_df, test_df, validate_df)
)

# The vocabulary is built from the training split only.
Text.build_vocab(train_ds, max_size=max_size, vectors="glove.6B.100d")
Label.build_vocab(train_ds)
vocab_size = len(Text.vocab)

In [0]:
def train(train_iter, dev_iter, model, epochs=1, lr=0.001, log_interval=100,
          eval_interval=100, patience=1000):
    """
    Train ``model`` with Adam and cross-entropy, validating periodically.

    Arguments:
        train_iter: iterable of batches exposing ``reviewText`` (a
            ``(token_ids, lengths)`` pair), ``overall`` and ``batch_size``;
            must support ``len()``.
        dev_iter: validation iterator, passed unchanged to ``eval``.
        model (nn.Module): model to optimise; it is moved to the GPU when one
            is available.
        epochs (int): number of passes over ``train_iter``. Default 1.
        lr (float): Adam learning rate. Default 0.001.
        log_interval (int): print the training loss/accuracy every this many
            steps. Default 100.
        eval_interval (int): run validation every this many steps. Default 100.
        patience (int): stop training when the validation accuracy has not
            improved for this many steps. Default 1000.

    Returns:
        None. The model is updated in place.
    """
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    steps = 0
    best_acc = 0
    last_step = 0
    model.train()
    for _ in range(1, epochs + 1):
        for batch in tqdm(train_iter, total=len(train_iter)):
            # The field was built with include_lengths=True, so the batch
            # attribute is a (token_ids, lengths) pair; lengths are unused.
            feature, _lengths = batch.reviewText
            target = batch.overall
            feature, target = feature.to(device), target.to(device)

            optimizer.zero_grad()
            logit = model(feature)
            loss = F.cross_entropy(logit, target)
            loss.backward()
            optimizer.step()

            steps += 1
            if steps % log_interval == 0:
                predicted = torch.max(logit, 1)[1].view(target.size())
                corrects = (predicted == target).sum().item()
                accuracy = 100.0 * corrects / batch.batch_size
                print(
                    '\rBatch[{}] - loss: {:.6f}  acc: {:.4f}%({}/{})'.format(steps,
                                                                             loss.item(),
                                                                             accuracy,
                                                                             corrects,
                                                                             batch.batch_size))
            if steps % eval_interval == 0:
                dev_acc, _valid_f1 = eval(dev_iter, model)
                # eval() switches the model to evaluation mode; switch back so
                # that dropout stays active for the remaining training steps.
                model.train()
                if dev_acc > best_acc:
                    best_acc = dev_acc
                    last_step = steps
                elif steps - last_step >= patience:
                    # Actually stop: previously the message was printed but
                    # training carried on regardless.
                    print('early stop by {} steps.'.format(patience))
                    return
            


def eval(data_iter, model):
    """
    Evaluate ``model`` on every batch of ``data_iter`` and print a summary.

    Note: this function shadows Python's built-in ``eval`` within the notebook;
    the name is kept because other cells call it.

    Arguments:
        data_iter: iterable of batches exposing ``reviewText`` (a
            ``(token_ids, lengths)`` pair) and ``overall``; must also expose
            ``dataset`` with a length.
        model (nn.Module): model to evaluate. It is left in evaluation mode.

    Returns:
        (accuracy, f1): accuracy in percent over the whole dataset and the
        macro-averaged F1 score over the whole dataset, both plain Python
        floats. (Previously the second value was the *sum* of per-batch F1
        scores, which did not match the averaged value that was printed.)
    """
    model.eval()
    # Use the device the model actually lives on instead of relying on a
    # notebook-level `device` variable being defined by some other cell.
    device = next(model.parameters()).device

    corrects, total_loss = 0, 0.0
    all_preds, all_targets = [], []
    # No gradients are needed for evaluation; skipping them saves memory.
    with torch.no_grad():
        for batch in data_iter:
            feature, _lengths = batch.reviewText
            target = batch.overall
            feature, target = feature.to(device), target.to(device)

            logit = model(feature)
            # reduction='sum' replaces the deprecated size_average=False.
            total_loss += F.cross_entropy(logit, target, reduction='sum').item()

            predicted = torch.max(logit, 1)[1].view(target.size())
            corrects += (predicted == target).sum().item()
            all_preds.append(predicted.cpu())
            all_targets.append(target.cpu())

    size = len(data_iter.dataset)
    avg_loss = total_loss / size if size else 0.0
    accuracy = 100.0 * corrects / size if size else 0.0
    if all_targets:
        # One macro-F1 over all predictions (y_true first, then y_pred) rather
        # than an average of per-batch scores, which is distorted whenever a
        # class is missing from a batch.
        f1 = float(f1_score(torch.cat(all_targets).numpy(),
                            torch.cat(all_preds).numpy(),
                            average='macro'))
    else:
        f1 = 0.0
    print('\nEvaluation - loss: {:.6f}  acc: {:.4f}%({}/{}) \n'.format(avg_loss,
                                                                       accuracy,
                                                                       corrects,
                                                                       size))
    print(f'\t F1 score is {f1}')
    return accuracy, f1

In [7]:
from torch import optim

# Hyper-parameters for CNN_Text and for the batch iterators created below.
embed_num = vocab_size  # rows of the embedding table: one per vocabulary entry
# NOTE(review): the vocabulary is built with 100-d GloVe vectors, but the model
# creates its own 128-d nn.Embedding and never reads them — confirm intended.
embed_dim = 128
class_num = 3  # labels 0 / 1 / 2 produced by three_class_problem
kernel_num = 100  # output channels of each convolution
kernel_sizes = [3,4,5]  # convolution heights, in tokens
batch_size = 64
# Prefer the first GPU when CUDA is available, otherwise run on the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

class MyIter:
    """Adapter that re-shapes the batches of a wrapped iterator into ``(inputs, labels)`` pairs."""

    def __init__(self, it):
        # The wrapped iterable; it must also support len().
        self.it = it

    def __len__(self):
        """Number of batches, as reported by the wrapped iterator."""
        return len(self.it)

    def __iter__(self):
        """Yield ``(batch.reviewText, labels)`` with labels given a trailing axis of size 1."""
        for wrapped_batch in self.it:
            inputs = wrapped_batch.reviewText
            labels = wrapped_batch.overall.unsqueeze(1)
            yield (inputs, labels)

# One BucketIterator per split; examples are sorted by review length inside
# each batch so that sequences of similar length end up together.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_ds, valid_ds, test_ds),
    sort_within_batch=True,
    sort_key=lambda example: len(example.reviewText),
    device=device,
    batch_size=batch_size)


# Build the classifier from the hyper-parameters above and train it, using the
# validation iterator for the periodic evaluation.
cnn = CNN_Text(embed_num=embed_num,
               embed_dim=embed_dim,
               class_num=class_num,
               kernel_num=kernel_num,
               kernel_sizes=kernel_sizes)
train(train_iter=train_iterator, dev_iter=valid_iterator, model=cnn)
#train_iterator, valid_iterator, test_iterator = create_iterator(train_ds, valid_ds, test_ds, batch_size, device)






  4%|▎         | 99/2809 [00:22<09:58,  4.53it/s]

Batch[100] - loss: 1.104267  acc: 42.1875%(27/64)




KeyboardInterrupt: ignored

In [0]:
# Evaluate the trained model on the held-out test split; eval() prints its own
# summary first and returns the metrics reported on the last line.
test_metrics = eval(test_iterator, cnn)
print(f"FINAL OUPUT IS \n{test_metrics}")