In [1]:
import re

import pandas as pd
import torch
from torch.functional import norm

from torchtext import data
from torchtext.data import Field

from transformers import BertTokenizer, BertModel

PRE_TRAINED_MODEL_NAME = 'bert-base-uncased'

In [2]:
class DataFrameDataset(data.Dataset):

    def __init__(self, df, fields, is_test=False, **kwargs):
        examples = []
        for i, row in df.iterrows():
            label = row.target if not is_test else None
            text = row.text
            examples.append(data.Example.fromlist([text, label], fields))

        super().__init__(examples, fields, **kwargs)

    @staticmethod
    def sort_key(ex):
        return len(ex.text)

    @classmethod
    def splits(cls, fields, train_df, val_df=None, test_df=None, **kwargs):
        train_data, val_data, test_data = (None, None, None)
        data_field = fields

        #cls == DataFrameDataset

        if train_df is not None:
            train_data = cls(train_df.copy(), data_field, **kwargs)
        if val_df is not None:
            val_data = cls(val_df.copy(), data_field, **kwargs)
        if test_df is not None:
            test_data = cls(test_df.copy(), data_field, True, **kwargs)

        return tuple(d for d in (train_data, val_data, test_data) if d is not None)

In [3]:

def normalize_text(text):
    text = text.lower()  # lowercase
    text = re.sub(r"\#", "", text)
    text = re.sub(r"http\S+", "URL", text)  # remove URL addresses
    text = re.sub(r"@", "", text)
    text = re.sub(r"[^A-Za-z0-9()!?\'\`\"]", " ", text)
    text = re.sub("\s{2,}", " ", text)
    return text

In [11]:
df = pd.read_csv("../data/spam.csv", encoding='latin-1')
df = df[['v1', 'v2']]
df = df.rename(columns={'v1': 'target', 'v2': 'text'})
labels = df['target']
texts = df['text']
texts = texts.apply(normalize_text)

tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [12]:
texts

0       go until jurong point crazy available only in ...
1                                ok lar joking wif u oni 
2       free entry in 2 a wkly comp to win fa cup fina...
3            u dun say so early hor u c already then say 
4       nah i don't think he goes to usf he lives arou...
                              ...                        
5567    this is the 2nd time we have tried 2 contact u...
5568                   will b going to esplanade fr home?
5569    pity was in mood for that so any other suggest...
5570    the guy did some bitching but i acted like i'd...
5571                            rofl its true to its name
Name: text, Length: 5572, dtype: object

In [65]:
TEXT = Field(sequential=True,
             use_vocab=False,
             tokenize=tokenizer
             )

LABEL = Field(sequential=False,
              use_vocab=False,
              preprocessing=lambda x: 1 if x == "spam" else 0,
              is_target=True)

fields = [('text', TEXT), ('label', LABEL)]

In [66]:
datasets = DataFrameDataset.splits(fields=fields, train_df=df)

In [67]:
dataset = datasets[0]

In [72]:
dataset[0].__dict__

{'text': {'input_ids': [101, 2175, 2127, 18414, 17583, 2391, 1010, 4689, 1012, 1012, 2800, 2069, 1999, 11829, 2483, 1050, 2307, 2088, 2474, 1041, 28305, 1012, 1012, 1012, 25022, 2638, 2045, 2288, 26297, 28194, 1012, 1012, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]},
 'label': 0}

In [71]:
tokenizer.convert_ids_to_tokens(dataset[0].__dict__['text']['input_ids'])

['[CLS]',
 'go',
 'until',
 'ju',
 '##rong',
 'point',
 ',',
 'crazy',
 '.',
 '.',
 'available',
 'only',
 'in',
 'bug',
 '##is',
 'n',
 'great',
 'world',
 'la',
 'e',
 'buffet',
 '.',
 '.',
 '.',
 'ci',
 '##ne',
 'there',
 'got',
 'amore',
 'wat',
 '.',
 '.',
 '.',
 '[SEP]']