In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

from importlib_resources import files
from symspellpy import SymSpell
import re

from utility.decorators import print_func_name
from utility.paths import UtilityPath, DataPath

## PreprocessingUtils

In [2]:
# Register `progress_apply` on pandas objects so long-running `apply` calls show a bar
tqdm.pandas()

# Fetch every NLTK resource used below (POS tagger, stopword list, WordNet data)
for _nltk_resource in ("averaged_perceptron_tagger", "stopwords", "wordnet", "omw-1.4"):
    nltk.download(_nltk_resource)

# Maps a placeholder token (e.g. '<smile>') to the list of textual emoticons grouped under it.
# `Preprocessing.remove_space_before_symbol` flattens the values of this dict to build its
# emoticon regexes, so list order determines regex alternation order.
EMOTICONS_GLOVE = {
    '<smile>': [':-]', '0:3', '8)', ":'-)", '>^_^<', '(^_^)', "(';')", ':*',
                '(^^)/', ':)', ':>', '(*_*)', '(^^)v', '=3', ':}', ';^)', ':->', '^_^;',
                '=)', '(^o^)', '*)', '(^.^)', '^_^', '\\o/', '^5', '(__)', '(#^.^#)', '0:)',
                '(^^)', ';]', ':-*', ':^)', ':3', '(+_+)', ';)', ":')", '(:', ':-3', ':-}',
                ';-)', ':-)', ':]', '*-)', 'o/\\o', '=]', '(^_-)', '8-)', ':o)', ':c)',
                '(^_^)/', '(o.o)', ':o', '>:)', '8-0', ':-0', ';3', '>:3', '3:)', ':-o',
                '}:)', 'o_0', '^^;', 'xx', 'xxx', '^o^', ':d', ' c:'],
    '<lolface>': [':-p', ':p', ':b', ':-b', 'x-p', '=p'],
    '<heart>': ['<3'],
    '<neutralface>': ['=\\', '>:/', '(..)', '(._.)', ':-/', ':|', '>.<', ':-.',
                      "('_')", '=/', ':/', ':#', '(-_-)', 'o-o', 'o_o', ':$', '>:\\', ':@', ':-|',
                      '><>', '(-.-)', ':\\', '<+', ':-@'],
    '<sadface>': [';(', '(~_~)', ':c', ':[', ':-&', ':(', '>:[', ':&', ':-c',
                  ';n;', ":'(", ';;', ':-[', ';-;', '%)', ':<', '<\\3', ':{', ';_;', '=(',
                  'v.v', 'm(__)m', '</3', ":'-(", ':-<']
}


class PreprocessingUtils:
    """
    Text helpers shared by the preprocessing pipeline: `SymSpell`-based word
    segmentation / spelling correction and nltk-based lemmatization.
    """

    def __init__(self):
        # The `SymSpell` dictionaries are loaded lazily by `_get_symspell`
        self._symspell = None

    def _get_symspell(self):
        """
        Returns the cached `SymSpell` object, instantiating it (and loading its
        unigram and bigram dictionaries) on first use.

        :return: instantiated object
        :rtype: SymSpell
        """
        # If it's not already instantiated
        if self._symspell is None:
            # Instantiating `SymSpell`
            self._symspell = SymSpell()

            # Getting dictionary for single words
            dictionary_path = files("symspellpy").joinpath("frequency_dictionary_en_82_765.txt")
            self._symspell.load_dictionary(
                dictionary_path, term_index=0, count_index=1
            )

            # Getting dictionary for bigram (two words)
            bigram_path = files("symspellpy").joinpath("frequency_bigramdictionary_en_243_342.txt")
            self._symspell.load_bigram_dictionary(
                bigram_path, term_index=0, count_index=2
            )

        return self._symspell

    def word_segmentation(self, text):
        """
        Tries to put spaces between words in a text (used for hashtags).
        (e.g.: helloguys --> hello guys)

        :param text: Text to be converted (typically a hashtag)
        :type text: str

        :return: Processed text
        :rtype: str
        """
        # `max_edit_distance = 0` avoids that `SymSpell` corrects spelling.
        result = self._get_symspell().word_segmentation(
            text, max_edit_distance=0
        )
        return result.segmented_string

    def correct_spelling(self, text):
        """
        Corrects the spelling of a text (e.g.: helo -> hello)

        :param text: Text to be converted
        :type text: str

        :return: Processed text
        :rtype: str
        """
        # `max_edit_distance = 2` tells `SymSpell` to check at a maximum distance
        # of 2 in the vocabulary. Only words with at most 2 letters wrong will be corrected.
        result = self._get_symspell().lookup_compound(
            text, max_edit_distance=2
        )

        # Only the first suggestion returned by `lookup_compound` is used
        return result[0].term

    @staticmethod
    def _get_wordnet_tag(nltk_tag):
        """
        Returns the WordNet part-of-speech constant matching an nltk pos tag.

        :param nltk_tag: nltk pos tag of a single word (e.g. "VBD", "NN")
        :type nltk_tag: str

        :return: type of word
        :rtype: str
        """
        if nltk_tag.startswith("V"):
            return wordnet.VERB
        elif nltk_tag.startswith("N"):
            return wordnet.NOUN
        elif nltk_tag.startswith("J"):
            return wordnet.ADJ
        elif nltk_tag.startswith("R"):
            return wordnet.ADV
        else:
            # This is the default in WordNetLemmatizer when no pos tag is passed
            return wordnet.NOUN

    @staticmethod
    def lemmatize(text):
        """
        Performs lemmatization using nltk pos tag and `WordNetLemmatizer`.

        :param text: Text to be processed (split on whitespace before tagging)
        :type text: str
        :return: processed text, lemmas re-joined with single spaces
        :rtype: str
        """
        nltk_tagged = nltk.pos_tag(text.split())
        lemmatizer = WordNetLemmatizer()

        # This is a static method, so there is no `self`: the tag helper must be
        # reached through the class (using `self` here raised a NameError).
        return " ".join(
            lemmatizer.lemmatize(w, PreprocessingUtils._get_wordnet_tag(nltk_tag))
            for w, nltk_tag in nltk_tagged
        )

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/thainamhoang/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/thainamhoang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/thainamhoang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/thainamhoang/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Preprocessing

In [17]:
class Preprocessing:
    """
    Loads tweets into `self.df` and exposes one method per preprocessing step.
    Every step rewrites the "text" column of `self.df` in place, so steps can be
    chained by name (e.g. `getattr(prep, "remove_tag")()`).
    """

    def __init__(self, path_ls: list, is_test: bool = False):
        """
        :param path_ls: one or two file paths to load
        :type path_ls: list
        :param is_test: when True and a single path is given, the file is parsed as
            test data (`id,text` lines)
        :type is_test: bool

        :raises ValueError: if `path_ls` does not contain 1 or 2 paths
        """
        if len(path_ls) > 2 or len(path_ls) < 1:
            raise ValueError("Length of path should be 1 or 2.")

        self._is_test = is_test
        self._path_ls = path_ls
        self._prep_utils = PreprocessingUtils()

        self.df = self._load_data()

    def _load_data(self):
        """
        Dispatches to the test loader (single path and `is_test`) or the train loader.

        :return: loaded data
        :rtype: pd.DataFrame
        """
        if len(self._path_ls) == 1 and self._is_test:
            return self._load_test_data()
        else:
            return self._load_train_data()

    def _load_train_data(self):
        """
        Loads training data. A single path containing "csv" is read with
        `pd.read_csv`; otherwise each path is read line by line and labelled -1 when
        its path contains "neg" and 1 otherwise.

        :return: frame with lower-cased "text" and int64 "label" columns
            (or whatever the csv contains)
        :rtype: pd.DataFrame
        """
        if len(self._path_ls) == 1 and "csv" in self._path_ls[0]:
            return pd.read_csv(self._path_ls[0])

        dfs = []

        for path in self._path_ls:
            # The label is decided per file. A flag shared across iterations stayed
            # at 1 once a non-"neg" path had been seen, so [pos, neg] ordering
            # labelled the negative file as positive.
            label = -1 if "neg" in path else 1
            with open(path, encoding="utf-8") as f:
                content = f.read().splitlines()

            dfs.append(
                pd.DataFrame({"text": content, "label": np.full(len(content), label, dtype="int64")})
            )

        df = pd.concat(dfs, ignore_index=True)
        df["text"] = df["text"].str.lower()
        df["label"] = df["label"].astype("int64")
        return df

    def _load_test_data(self):
        """
        Loads test data where each line is `<id>,<text>`; only the first comma
        separates the id from the text.

        :return: frame with "ids" and lower-cased "text" columns
        :rtype: pd.DataFrame
        """
        with open(self._path_ls[0], encoding="utf-8") as f:
            content = f.read().splitlines()

        ids = []
        texts = []
        for line in content:
            # Everything after the first comma belongs to the tweet itself
            _id, _, _text = line.partition(",")
            ids.append(_id)
            texts.append(_text)

        df = pd.DataFrame({"ids": ids, "text": texts})
        df["text"] = df["text"].str.lower()
        return df

    # NOTE(review): this does not follow the descriptor protocol signature
    # (`__get__(self, instance, owner)`); it only works when called explicitly.
    def __get__(self) -> pd.DataFrame:
        return self.df

    @print_func_name
    def __len__(self) -> int:
        return len(self.df)

    @print_func_name
    def shape(self) -> tuple:
        """Returns the shape of the underlying frame."""
        return self.df.shape

    @print_func_name
    def create_raw(self):
        """Copies the current "text" column into a "raw" column."""
        self.df["raw"] = self.df["text"]

    @print_func_name
    def strip(self):
        """Strips leading/trailing whitespace from every text."""
        self.df["text"] = self.df["text"].str.strip()

    @print_func_name
    def remove_tag(self):
        """Removes `<...>` tags made of word characters (e.g. `<user>`), then strips."""
        self.df["text"] = self.df["text"].str.replace(r"<[\w]*>", "", regex=True)
        self.strip()

    @print_func_name
    def remove_space_before_symbol(self):
        """
        Surrounds every emoticon of `EMOTICONS_GLOVE` that contains no letter or digit
        with single spaces.
        """
        emo_list = [el for value in list(EMOTICONS_GLOVE.values()) for el in value]
        emo_with_spaces_pattern = re.compile('|'.join(re.escape(' '.join(emo)) for emo in emo_list))
        all_non_alpha_emo_pattern = re.compile(
            '|'.join(re.escape(emo) for emo in emo_list if not any(char.isalpha() or char.isdigit() for char in emo)))

        # Define a function to handle replacement
        def _replace_func(match):
            text = match.group()
            # NOTE(review): `text` always comes from `all_non_alpha_emo_pattern`, whose
            # alternatives contain no spaces, so this branch appears unreachable.
            # Confirm whether spaced emoticons (": )") were meant to be searched too.
            if emo_with_spaces_pattern.match(text):
                return text.replace(" ", "")
            return f' {text} '

        # Applying the transformations
        self.df["text"] = self.df["text"].progress_apply(
            lambda x: all_non_alpha_emo_pattern.sub(_replace_func, x)
        )

    @print_func_name
    def remove_extra_space(self):
        """Collapses every run of whitespace into a single space and resets the index."""
        self.df["text"] = self.df["text"].progress_apply(lambda text: " ".join(text.split()))
        self.df.reset_index(inplace=True, drop=True)

    @print_func_name
    def remove_ellipsis(self):
        """Removes a trailing `...` from every text."""
        self.df["text"] = self.df["text"].str.replace(r'\.{3}$', '', regex=True)

    @print_func_name
    def remove_hashtag(self):
        """Replaces every `#` with a space (the hashtag word itself is kept)."""
        # `regex=False` makes the literal replacement explicit across pandas versions
        self.df["text"] = self.df["text"].str.replace("#", " ", regex=False)

    @print_func_name
    def remove_space_after_quote(self):
        """
        Removes the spaces found inside double-quoted spans, and inside single-quoted
        spans that are followed by a whitespace character (that whitespace included).
        """
        def _find_pattern(text):
            pattern = r'(("[^"]*")|(\'[^\']*\'\s))'
            return re.sub(pattern, lambda match: match.group(1).replace(' ', ''), text)

        self.df["text"] = self.df["text"].progress_apply(_find_pattern)

    @print_func_name
    def reconstruct_emoji(self):
        """
        Inserts a `:` in front of every unbalanced parenthesis, turning a lone `)`
        into `:)` and a lone `(` into `:(`.
        """
        def _find_unmatched_parentheses(text):
            open_stack = []  # Stack to keep track of indices of '('
            unmatched_indices = []  # List to store indices of unmatched parentheses

            for i, char in enumerate(text):
                if char == '(':
                    open_stack.append(i)  # Push the index of '(' onto the stack
                elif char == ')':
                    if open_stack:
                        open_stack.pop()  # Pop the last '(' as it's a matched pair
                    else:
                        unmatched_indices.append(i)  # Unmatched ')'

            # Add remaining indices from the stack to unmatched_indices
            unmatched_indices.extend(open_stack)

            return sorted(unmatched_indices)

        def _add_colon(text) -> str:
            unmatched_indices = _find_unmatched_parentheses(text)
            if len(unmatched_indices) == 0:
                return text

            char_t = list(text)

            # Indices are sorted, so each earlier insertion shifts the next target by one
            for i, index in enumerate(unmatched_indices):
                char_t.insert(index + i, ':')

            return "".join(char_t)

        self.df["text"] = self.df["text"].progress_apply(_add_colon)

    @print_func_name
    def drop_duplicates(self):
        """Drops rows with duplicated text, then rows with missing values, and resets the index."""
        self.df = self.df.drop_duplicates(subset=['text'])
        self.df = self.df.dropna().reset_index(drop=True)

    @print_func_name
    def lemmatize(self):
        """Lemmatizes every text with `PreprocessingUtils.lemmatize`."""
        self.df["text"] = self.df["text"].progress_apply(self._prep_utils.lemmatize)

    @print_func_name
    def correct_spelling(self):
        """Spell-corrects every text with `PreprocessingUtils.correct_spelling`."""
        self.df["text"] = self.df["text"].progress_apply(self._prep_utils.correct_spelling)

    @print_func_name
    def remove_stopwords(self):
        """Removes the nltk English stopwords from every text."""
        _stopwords = set(stopwords.words("english"))

        # Removing stopwords for each tweet
        self.df["text"] = self.df["text"].progress_apply(
            lambda text: " ".join(
                [word for word in str(text).split() if word not in _stopwords]
            )
        )

    @print_func_name
    def slang_to_word(self):
        """
        Replaces every whitespace-separated word found in the slang table
        (`UtilityPath.SLANG`, columns "slang" and "text") by its expansion.
        """
        # https://github.com/Zenexer/internet-reference/blob/main/Internet%20Slang%20and%20Emoticons.md
        slang_doc = pd.read_csv(UtilityPath.SLANG).set_index('slang')['text'].to_dict()

        def _find_slang(text: str) -> str:
            # Words missing from the table are kept unchanged
            return " ".join(slang_doc.get(word, word) for word in text.split())

        self.df["text"] = self.df["text"].progress_apply(_find_slang)

    @print_func_name
    def fillna(self):
        """Replaces missing texts with the `<empty-text>` placeholder."""
        # Was `.filna(...)`, which raised AttributeError
        self.df["text"] = self.df["text"].fillna("<empty-text>")

In [4]:
from abc import ABC, abstractmethod
from time import strftime

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split


class Model(ABC):
    def __init__(self, weight_path: str, submission_path: str, is_weight: bool = False, seed: int = 42):
        self.weight_path = weight_path
        self.submission_path = submission_path
        self.is_weight = is_weight
        self.seed = seed

    @abstractmethod
    def train(self, x, y, batch_size: int, epochs: int):
        pass

    @abstractmethod
    def preprocessing(self, is_train: bool = True):
        pass

    @abstractmethod
    def predict(self, x):
        pass

    def split_data(self, x, y, test_size: float, shuffle: bool = True, **kwargs):
        return train_test_split(x, y, test_size=test_size, shuffle=shuffle, random_state=self.seed, **kwargs)

    def submit(self, predictions: np.ndarray | list):
        submission = pd.DataFrame({"Id": np.arange(len(predictions)), "Prediction": predictions})
        submission = submission.astype(int).replace(0, -1)

        submission.to_csv(f"{self.submission_path}/submission_{strftime('%Y-%m-%d_%H:%M:%S')}.csv", index=False)


## BERT

In [37]:
from models.base import Model

from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures, AdamWeightDecay, WarmUp
import tensorflow as tf
from keras import optimizers, losses, metrics

import numpy as np
from tqdm.auto import tqdm


class Bert(Model):
    """
    `bert-base-uncased` sequence classifier (TensorFlow) with its tokenizer,
    preprocessing recipe, training loop and prediction/submission helper.
    """

    def __init__(self,
                 weight_path: str = "",
                 submission_path: str = "",
                 is_weight: bool = False,
                 seed: int = 42,
                 max_length: int = 128):
        """
        :param weight_path: directory used by `save_pretrained` / `from_pretrained`
        :param submission_path: directory in which submissions are written
        :param is_weight: load the model from `weight_path` instead of the hub checkpoint
        :param seed: random state used for the train/validation split
        :param max_length: maximum number of tokens per tweet
        """
        super().__init__(weight_path, submission_path, is_weight, seed)

        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        self.max_length = max_length

        # Load weights
        if self.is_weight:
            self.model = TFBertForSequenceClassification.from_pretrained(self.weight_path)
        else:
            self.model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")

    def preprocessing(self, is_train: bool = True):
        """
        Returns the names of the `Preprocessing` methods to run, in order.
        Training data additionally starts with `drop_duplicates`.

        :rtype: list[str]
        """
        steps = [
            "remove_tag",
            "remove_ellipsis",
            "reconstruct_emoji",
            "remove_extra_space",
            "remove_space_before_symbol",
            "remove_extra_space"
        ]

        if is_train:
            return ["drop_duplicates"] + steps

        return steps

    def predict(self, x: np.ndarray):
        """
        Predicts a class index for each tweet in `x`, writes them with `submit`
        and returns them.

        :param x: iterable of tweets
        :return: list of predicted class indices
        :rtype: list
        """
        predictions = []

        for tweet in tqdm(x, desc="Generating predictions"):
            # Truncate like in training so overly long inputs cannot exceed the
            # model's maximum sequence length
            feature = self.tokenizer.encode_plus(
                text=tweet,
                max_length=self.max_length,
                truncation=True,
                return_tensors='tf'
            )
            output = self.model(feature)[0].numpy().squeeze().argmax()
            predictions.append(output)

        self.submit(predictions)
        return predictions

    def create_tf_dataset(self, x, y):
        """
        Tokenizes `x` (padded/truncated to `self.max_length`) and wraps the features
        and labels `y` in an un-batched `tf.data.Dataset` of `(inputs_dict, label)`.

        :param x: iterable of texts
        :param y: iterable of labels, aligned with `x`
        :rtype: tf.data.Dataset
        """
        features = []

        for text, label in tqdm(zip(x, y), desc="Tokenizing data", total=len(x)):
            input_dict = self.tokenizer.encode_plus(
                text,
                add_special_tokens=True,
                max_length=self.max_length,
                padding="max_length",
                return_attention_mask=True,
                return_token_type_ids=True,
                truncation=True
            )

            # `token_type_ids` must be stored too: the generator below yields it, and
            # leaving it out made every yielded value `None`.
            features.append(
                InputFeatures(
                    input_ids=input_dict['input_ids'],
                    attention_mask=input_dict['attention_mask'],
                    token_type_ids=input_dict['token_type_ids'],
                    label=label
                )
            )

        def _generator():
            for feature in tqdm(features, desc="Generating features"):
                yield (
                    {
                        "input_ids": feature.input_ids,
                        "attention_mask": feature.attention_mask,
                        "token_type_ids": feature.token_type_ids,
                    },
                    feature.label,
                )

        # The declared types/shapes must have exactly the keys the generator yields
        return tf.data.Dataset.from_generator(
            _generator,
            ({
                 'input_ids': tf.int32,
                 'attention_mask': tf.int32,
                 'token_type_ids': tf.int32,
             }, tf.int64),
            ({
                 'input_ids': tf.TensorShape([None]),
                 'attention_mask': tf.TensorShape([None]),
                 'token_type_ids': tf.TensorShape([None]),
             }, tf.TensorShape([]),),
        )

    def train(self, x, y, batch_size: int, epochs: int):
        """
        Fine-tunes the model on a 90/10 train/validation split of `(x, y)` and saves
        it to `self.weight_path`.

        :param x: texts
        :param y: labels; -1 is remapped to 0 before training
        :param batch_size: batch size of both datasets
        :param epochs: number of epochs
        """
        # Sparse categorical cross-entropy needs class indices starting at 0, while
        # the loader of this notebook emits -1 / 1. Labels already in {0, 1} are
        # left untouched.
        y = np.asarray(y)
        y = np.where(y == -1, 0, y)

        X_train, X_val, y_train, y_val = self.split_data(x, y, test_size=0.1)

        # NOTE(review): the shuffle buffer is derived from the token length
        # (`max_length // 2`), which looks unrelated to the dataset size - confirm.
        train_data = self.create_tf_dataset(X_train, y_train).shuffle(self.max_length // 2,
                                                              reshuffle_each_iteration=True).batch(batch_size)
        val_data = self.create_tf_dataset(X_val, y_val).batch(batch_size)

        steps_per_epoch = len(X_train) // batch_size
        num_train_steps = steps_per_epoch * epochs

        print(f"Training steps: {num_train_steps}")

        lr = 2e-5
        opt_epsilon = 1.5e-8

        # Linear decay to 0 over the whole run, preceded by a warm-up over the
        # first 10% of the steps
        decay_schedule = optimizers.schedules.PolynomialDecay(
            initial_learning_rate=lr,
            decay_steps=num_train_steps,
            end_learning_rate=0)

        warmup_schedule = WarmUp(
            initial_learning_rate=lr,
            decay_schedule_fn=decay_schedule,
            warmup_steps=(num_train_steps * 0.1))

        optimizer = AdamWeightDecay(learning_rate=warmup_schedule,
                                    epsilon=opt_epsilon,
                                    clipnorm=1.0)

        loss = losses.SparseCategoricalCrossentropy(from_logits=True)
        metric = metrics.SparseCategoricalAccuracy("accuracy")

        self.model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

        print("Fitting model")
        self.model.fit(train_data, epochs=epochs, validation_data=val_data)

        print("Saving weights")
        self.model.save_pretrained(self.weight_path)


## Prep

In [13]:
# Full training sets (negative / positive tweets)
TRAIN_NEG_FULL = "./data/train_neg_full.txt"
TRAIN_POS_FULL = "./data/train_pos_full.txt"

# Training sets loaded by the `Preprocessing` cell below
TRAIN_NEG = "./data/train_neg.txt"
TRAIN_POS = "./data/train_pos.txt"

# Test tweets
TEST_DATA = "./data/test_data.txt"

# BERT-specific paths: preprocessed csv files, weights directory, submissions directory
BERT_TRAIN_PREP = "./data/preprocessed/bert/train.csv"
BERT_TEST_PREP = "./data/preprocessed/bert/test.csv"
BERT_WEIGHT = "./weights/bert"
BERT_SUBMISSION = "./submissions/bert"

In [14]:
# Two paths -> train loader (label -1 for the "neg" path, 1 otherwise);
# a single path with `is_test=True` -> test loader (`id,text` lines).
train_prep = Preprocessing([TRAIN_NEG, TRAIN_POS])
test_prep = Preprocessing([TEST_DATA], is_test=True)

In [38]:
# Maximum number of tokens per tweet fed to BERT
MAX_LEN = 128

# Pass the constant instead of repeating the literal so the two cannot drift apart
bert = Bert(weight_path=BERT_WEIGHT,
            submission_path=BERT_SUBMISSION,
            max_length=MAX_LEN)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
from tqdm.auto import tqdm

# `bert.preprocessing()` returns method names; each one is looked up on the
# `Preprocessing` instance and called, rewriting `train_prep.df` in place.
for step in tqdm(bert.preprocessing(), desc="Preprocessing train data"):
    getattr(train_prep, step)()

Preprocessing train data:   0%|          | 0/7 [00:00<?, ?it/s]

Executing: `drop_duplicates`
Executing: `remove_tag`
Executing: `strip`
Executing: `remove_ellipsis`
Executing: `reconstruct_emoji`
inside



  0%|                                                                                        | 0/181307 [00:00<?, ?it/s][A

vinco tresorpack 6 ( difficulty 10 of 10 object : disassemble and reassemble the wooden pieces this beautiful wo 
vinco tresorpack 6 :( difficulty 10 of 10 object : disassemble and reassemble the wooden pieces this beautiful wo 
introduction to programming with c + + ( 2nd edition this solid foundation in the basics of c + + programming will 
introduction to programming with c + + :( 2nd edition this solid foundation in the basics of c + + programming will 
the post-boom in spanish american fiction ( suny series in latin american and iberian thought and culture what 
the post-boom in spanish american fiction :( suny series in latin american and iberian thought and culture what 
layers of the heart ( paperback this journey was inspired by a recent robbery that took place in the united sta 
layers of the heart :( paperback this journey was inspired by a recent robbery that took place in the united sta 
guess who texted me again and wants us back ( ( (
guess who texted me again and wants 


  6%|████                                                                    | 10306/181307 [00:00<00:01, 103051.99it/s][A
 12%|████████▍                                                               | 21316/181307 [00:00<00:01, 107192.09it/s][A

no boss us angels are always busy at work ! ! ( x
no boss us angels are always busy at work ! ! :( x
a course in phonetics ( paperback the easy to understand approach builds on the basics , beginning with technica 
a course in phonetics :( paperback the easy to understand approach builds on the basics , beginning with technica 
1 ft ( 1ft ) cat 6 ethernet network patch cable black rj45 m / m ( 10 pack ultra spec cables 10 pack of 1ft cat 6 eth 
1 ft ( 1ft ) cat 6 ethernet network patch cable black rj45 m / m :( 10 pack ultra spec cables 10 pack of 1ft cat 6 eth 
stay healthy : actelion sees data on big drug hope in coming weeks ( reuters share with friends : | | health - 
stay healthy : actelion sees data on big drug hope in coming weeks :( reuters share with friends : | | health - 
) pyle plmr 24 3.5- inch 200 watt 3 - way weather proof mini box speaker system ( white ) ( pair
:) pyle plmr 24 3.5- inch 200 watt 3 - way weather proof mini box speaker system ( white ) :( pair
organic fa


 18%|████████████▋                                                           | 32036/181307 [00:00<00:01, 102242.96it/s][A
 23%|█████████████████                                                        | 42287/181307 [00:00<00:01, 97203.54it/s][A

se diamond wheel set 6pc dw13 1/8 " shank ( misc . with with breathing hole , size 7/8 in . , arbor 1/8 in . thicknes 
se diamond wheel set 6pc dw13 1/8 " shank :( misc . with with breathing hole , size 7/8 in . , arbor 1/8 in . thicknes 
over the cabinet organizer basket ( clear ) ( 5.5 " h x 8 " w x 3.25 " d organize small items under the sink and inside 
over the cabinet organizer basket ( clear ) :( 5.5 " h x 8 " w x 3.25 " d organize small items under the sink and inside 
remember me 3 : the last story ( mass market paperback someone did not like shari cooper writing her stories . sh 
remember me 3 : the last story :( mass market paperback someone did not like shari cooper writing her stories . sh 
09x34 custom picture frame / poster frame . 765 " wide complete matte black frame ( fw2bk this frame is manufactu 
09x34 custom picture frame / poster frame . 765 " wide complete matte black frame :( fw2bk this frame is manufactu 
kimble kimax , single-scale graduated cylinder class b w


 29%|█████████████████████                                                    | 52237/181307 [00:00<00:01, 97988.86it/s][A

italian with book and sticker and cassette ( s ) and other ( learn in your car kids includes activity book , sticke 
italian with book and sticker and cassette ( s ) and other :( learn in your car kids includes activity book , sticke 
pelican 2390-000- 110 2390 b m6 tactical light ( electronics-other pelican 2390-000- 110 2390 b m6 tactical light 
pelican 2390-000- 110 2390 b m6 tactical light :( electronics-other pelican 2390-000- 110 2390 b m6 tactical light 
javascript in javascript ( js.js sandboxing third-party scripts  via
javascript in javascript :( js.js sandboxing third-party scripts  via
webroot antivirus with spy sweeper 3 - user [ old version ] ( cd-rom award-winning protection against viruses and spyware
webroot antivirus with spy sweeper 3 - user [ old version ] :( cd-rom award-winning protection against viruses and spyware
heavy duty 5ft z-rack - double rail with 6ft uprights ( silver / black ) ( 79 " h x 63 " w x 23 " d with a 500 lb . load c 
heavy duty 5ft z-rack - do


 35%|█████████████████████████                                               | 63210/181307 [00:00<00:01, 101842.77it/s][A

dark corner [ vhs ] :( vhs tape the dark corner can't seriously be proposed as a great film noir , but it's one tha 
dan-d-pak cashews garlic , 16 - ounce plastic jars ( pack of 3
dan-d-pak cashews garlic , 16 - ounce plastic jars :( pack of 3
cellet microsd 4gb memory card for blackberry curve 8300 phone with sd adapter . ( lifetime warranty cellet micr 
cellet microsd 4gb memory card for blackberry curve 8300 phone with sd adapter . :( lifetime warranty cellet micr 
black cohosh root powder wildcrafted - cimicifuga racemosa , 1 lb , ( starwest botanicals 1 lb . of wild-crafted bl 
black cohosh root powder wildcrafted - cimicifuga racemosa , 1 lb , :( starwest botanicals 1 lb . of wild-crafted bl 
at tl71308 5.8 ghz digital three handset cordless telephone ( titanium and metallic charcoal for absolute simp 
at tl71308 5.8 ghz digital three handset cordless telephone :( titanium and metallic charcoal for absolute simp 
please follow me .. i love u ( ( (
please follow me .. i love u :( 


 41%|█████████████████████████████▍                                          | 74242/181307 [00:00<00:01, 103578.94it/s][A

harry and david peach salsa medium ( economy case pack ) 16 oz jar ( pack of 12 harry and david peach salsa mediu 
harry and david peach salsa medium ( economy case pack ) 16 oz jar :( pack of 12 harry and david peach salsa mediu 
dilbert and the way of the weasel ( hardcover in the weasel zone , where most people reside , everything is misle 
dilbert and the way of the weasel :( hardcover in the weasel zone , where most people reside , everything is misle 
citizen men's bw0072 - 07p eco-drive gold-tone leather watch ( watch for casual fridays or relaxing weekends , thi 
citizen men's bw0072 - 07p eco-drive gold-tone leather watch :( watch for casual fridays or relaxing weekends , thi 
american diabetes association complete guide to diabetes ( mass market paperback the most complete self-care gu 
american diabetes association complete guide to diabetes :( mass market paperback the most complete self-care gu 
1989 kawasaki vn 750 a vulcan left or rear headpipe ( on dual headpipe bikes 1


 47%|█████████████████████████████████▊                                      | 85030/181307 [00:00<00:00, 104914.70it/s][A
 54%|███████████████████████████████████████▏                                | 98757/181307 [00:00<00:00, 114872.37it/s][A

10x31 custom picture frame / poster frame . 77 " wide complete brown wood frame ( 586138 this frame is manufactur 
10x31 custom picture frame / poster frame . 77 " wide complete brown wood frame :( 586138 this frame is manufactur 
they almost always come home ( paperback when libbys husband greg fails to return from a two-week canoe trip t 
they almost always come home :( paperback when libbys husband greg fails to return from a two-week canoe trip t 
archos 604 30gb wi-fi , ultra-slim portable digital media player and recorder ( 50872 ) ( electronics store up to 
archos 604 30gb wi-fi , ultra-slim portable digital media player and recorder ( 50872 ) :( electronics store up to 
hp compaq pavilion dv4 - 1019tx notebook / laptop battery 8800mah high capacity ( replacement hp compaq pavilion 
hp compaq pavilion dv4 - 1019tx notebook / laptop battery 8800mah high capacity :( replacement hp compaq pavilion 
against the gods : the remarkable story of risk ( kindle edition a business week , n




- beat the p * * * y uppp )
- beat the p * * * y uppp :)
it's much easier to turn a friendship into love , than love into a friendship . -   ) )
it's much easier to turn a friendship into love , than love into a friendship . -   :) :)
way to get followed by me + shoutout . 1 ) rt this 2 ) follow  3 ) tweet me if you done
way to get followed by me + shoutout . 1 :) rt this 2 :) follow  3 :) tweet me if you done
i'd rather lay in his bed eat his food an watch tv all day (
i'd rather lay in his bed eat his food an watch tv all day :(
hahaha my teacher also calling me ra3ad .. but in fact my name is re9 ' a lol (  live on
hahaha my teacher also calling me ra3ad .. but in fact my name is re9 ' a lol :(  live on
w  bout to do some big thangs )
w  bout to do some big thangs :)
hahaha agree with ain ) rt  okay gd you dont have to promote yourself coz we all know you are a genius ! lol
hahaha agree with ain :) rt  okay gd you dont have to promote yourself coz we all know you are a genius ! lol


 63%|████████████████████████████████████████████▉                          | 114679/181307 [00:01<00:00, 128415.00it/s][A
 71%|██████████████████████████████████████████████████▎                    | 128573/181307 [00:01<00:00, 131612.05it/s][A

let me know if i can give you anything other than my unrelenting love and affection ; )
let me know if i can give you anything other than my unrelenting love and affection ; :)
i wish the best for class of 2012 :d hope yall succeed in life . hate to see yall leave but i love to see yall go )
i wish the best for class of 2012 :d hope yall succeed in life . hate to see yall leave but i love to see yall go :)
i kno them hoes talking bout me . but who gives a flying fuck )
i kno them hoes talking bout me . but who gives a flying fuck :)
how can you be so witty ? )
how can you be so witty ? :)
just ready for next semester with  comin to uncc ) and plenty of road trips )
just ready for next semester with  comin to uncc :) and plenty of road trips :)
have fun ) cant wait to hear bott it
have fun :) cant wait to hear bott it
say something in spanish please x (  live on
say something in spanish please x :(  live on
be classy , anything but trashy . " if your trashy i delete u " haha  does this 


 80%|████████████████████████████████████████████████████████▉              | 145498/181307 [00:01<00:00, 143000.10it/s][A


i'll explain everything next time we hang out :)
tomorrow = the lucky one with trev ) he doesn't know it yet #yayromance
tomorrow = the lucky one with trev :) he doesn't know it yet #yayromance
aba huma-hi follow me please pa ha . ) ) )
aba huma-hi follow me please pa ha . :) :) :)
wearing my bob marley shirt and rosta bracelet today . and my new glasses )
wearing my bob marley shirt and rosta bracelet today . and my new glasses :)
i'm in one of " dem moods " but i've grown and matured soo iaint gon do it )
i'm in one of " dem moods " but i've grown and matured soo iaint gon do it :)
i don't wanna spoil it ! it's more bloody than it is scary ! ! )
i don't wanna spoil it ! it's more bloody than it is scary ! ! :)
hey good looking what you got cooking lol ) its friday zozo a day to be cheeky . old song btw
hey good looking what you got cooking lol :) its friday zozo a day to be cheeky . old song btw
let's all give our thanks to the lord for this day . ) goodmorning mr .  always cheer up


 90%|███████████████████████████████████████████████████████████████▋       | 162558/181307 [00:01<00:00, 151329.18it/s][A

yesss lindsey pavao is back in the game mofooos ) ) ) #youdabest
yesss lindsey pavao is back in the game mofooos :) :) :) #youdabest
happy tuesday to all my new followers * waving * cherrycube _    welcome all )
happy tuesday to all my new followers * waving * cherrycube _    welcome all :)
in such a great moood ! ) ) today's only gonna get better !
in such a great moood ! :) :) today's only gonna get better !
s / o to  for fukin wit da bluee ) ) ) oweee lol wats craccin
s / o to  for fukin wit da bluee :) :) :) oweee lol wats craccin
y'all ready for tomorrow ?        ) )
y'all ready for tomorrow ?        :) :)
thank you god , for give me beautiful night so i can open my mind , to give him a smile , and move on )
thank you god , for give me beautiful night so i can open my mind , to give him a smile , and move on :)
my twin 2 year old siisters ! are officially miss guided ) #teamomg
my twin 2 year old siisters ! are officially miss guided :) #teamomg
you sound like a bum . #imjealous h


100%|███████████████████████████████████████████████████████████████████████| 181307/181307 [00:01<00:00, 126134.27it/s][A


"   yung playing years mo sa nba oh ! ) xd  i sooo love this pic ! hahahahahahaha
"   yung playing years mo sa nba oh ! :) xd  i sooo love this pic ! hahahahahahaha
i'm done , i give up , i don't wanna pretend no more , that's it , so what ? - love me for me by cher lloyd . lss ) )
i'm done , i give up , i don't wanna pretend no more , that's it , so what ? - love me for me by cher lloyd . lss :) :)
i can't believe you drinking without us ! bacardi pa ahhh ! ! ! aynako ! ! hahahha )
i can't believe you drinking without us ! bacardi pa ahhh ! ! ! aynako ! ! hahahha :)
is it the ed sheeran part in the song ? ? ) answer please 
is it the ed sheeran part in the song ? ? :) answer please 
lol . noel & & noman ) ( if i spelled that correctly )
lol . noel & & noman :) ( if i spelled that correctly )
love of my life ) ) )
love of my life :) :) :)
photos : intuit uses downtown seattle street as driving range : seriously ? )  #golf #marketing #brand management #intuit
photos : intuit uses downto


  0%|                                                                                        | 0/181307 [00:00<?, ?it/s][A
 28%|████████████████████▏                                                   | 50721/181307 [00:00<00:00, 507205.65it/s][A
 60%|██████████████████████████████████████████▌                            | 108806/181307 [00:00<00:00, 550522.92it/s][A
100%|███████████████████████████████████████████████████████████████████████| 181307/181307 [00:00<00:00, 567068.25it/s][A


Executing: `remove_space_before_symbol`



  0%|                                                                                        | 0/181307 [00:00<?, ?it/s][A
 12%|████████▌                                                               | 21496/181307 [00:00<00:00, 214953.03it/s][A
 24%|█████████████████▌                                                      | 44314/181307 [00:00<00:00, 222731.12it/s][A
 38%|███████████████████████████▌                                            | 69385/181307 [00:00<00:00, 235505.52it/s][A
 53%|█████████████████████████████████████▊                                  | 95216/181307 [00:00<00:00, 244505.47it/s][A
 67%|███████████████████████████████████████████████▋                       | 121626/181307 [00:00<00:00, 251567.71it/s][A
 83%|███████████████████████████████████████████████████████████            | 150759/181307 [00:00<00:00, 265085.89it/s][A
100%|███████████████████████████████████████████████████████████████████████| 181307/181307 [00:00<00:00, 255612.51it/s][A


Executing: `remove_extra_space`



  0%|                                                                                        | 0/181307 [00:00<?, ?it/s][A
 27%|███████████████████▋                                                    | 49496/181307 [00:00<00:00, 494959.29it/s][A
 59%|██████████████████████████████████████████                             | 107438/181307 [00:00<00:00, 544639.28it/s][A
100%|███████████████████████████████████████████████████████████████████████| 181307/181307 [00:00<00:00, 543041.76it/s][A


In [19]:
# Run the BERT preprocessing pipeline on the test set: `bert.preprocessing()`
# supplies the step names, and each one is looked up as a method on
# `test_prep` and invoked with no arguments, in the order given.
# NOTE(review): the before/after pairs printed during `reconstruct_emoji`
# show plain parentheses in product text being rewritten as emoticons
# (e.g. "( sports" -> ":( sports") - confirm this is intended.
test_pipeline_steps = bert.preprocessing()
for step in tqdm(test_pipeline_steps, desc="Preprocessing test data"):
    apply_step = getattr(test_prep, step)
    apply_step()

Preprocessing test data:   0%|          | 0/7 [00:00<?, ?it/s]

Executing: `drop_duplicates`
Executing: `remove_tag`
Executing: `strip`
Executing: `remove_ellipsis`
Executing: `reconstruct_emoji`
inside



100%|███████████████████████████████████████████████████████████████████████████| 9215/9215 [00:00<00:00, 108903.82it/s][A


sea doo pro sea scooter ( sports with the portable sea-doo seascootersave air , stay longer in the water and 
sea doo pro sea scooter :( sports with the portable sea-doo seascootersave air , stay longer in the water and 
its whatever . in a terrible mood ( (
its whatever . in a terrible mood :( :(
please say hi to denmark ! that would be amazing ! (  live on
please say hi to denmark ! that would be amazing ! :(  live on
3x3 custom picture frame / poster frame 1.2 " wide complete gold frame ( 2380763 9gd this frame is manufactured i 
3x3 custom picture frame / poster frame 1.2 " wide complete gold frame :( 2380763 9gd this frame is manufactured i 
nhl's bettman : suspension criticism ' gamesmanship ' ( the associated press new york ( ap ) nhl ...  #predators #nhl
nhl's bettman : suspension criticism ' gamesmanship ' :( the associated press new york ( ap ) nhl ...  #predators #nhl
barrel pickles ( 2.2 pound these german pickles are crisp , crunchy , and delicious . marinated in a lightly


100%|███████████████████████████████████████████████████████████████████████████| 9215/9215 [00:00<00:00, 424045.90it/s][A


Executing: `remove_space_before_symbol`



100%|███████████████████████████████████████████████████████████████████████████| 9215/9215 [00:00<00:00, 205129.56it/s][A


Executing: `remove_extra_space`



100%|███████████████████████████████████████████████████████████████████████████| 9215/9215 [00:00<00:00, 484233.02it/s][A


In [22]:
# Seed for the shuffle below, so that a Restart & Run All writes the same
# row order to disk every time instead of a different one on each run.
SHUFFLE_SEED = 42

# Fetch the preprocessed training data from the preprocessing object
# (presumably a pandas DataFrame - it is used as one below).
train_df = train_prep.__get__()
# frac=1 samples every row, i.e. a full shuffle; reset_index(drop=True)
# discards the pre-shuffle index so rows are renumbered from 0.
train_df = train_df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
# Persist the shuffled data without writing the index as a column.
train_df.to_csv(BERT_TRAIN_PREP, index=False)

In [23]:
# Fetch the preprocessed test data from the preprocessing object
# (presumably a pandas DataFrame - TODO confirm against the class).
test_df = test_prep.__get__()
# Write it to the BERT test path; index=False keeps the index out of the file.
test_df.to_csv(BERT_TEST_PREP, index=False)

## Training

In [26]:
# Reload the preprocessed training set from disk and discard every row
# that contains a missing value in any column.
train_df = pd.read_csv(BERT_TRAIN_PREP).dropna()

In [34]:
# Hyper-parameters passed to the training call.
BATCH_SIZE = 24
EPOCHS = 3

# Inputs and targets pulled out of the training frame as arrays.
X = train_df['text'].values
y = train_df['label'].values

In [39]:
# Start training on the prepared arrays with the configured batch size and
# number of epochs.
# NOTE(review): the output saved with this cell ends in KeyboardInterrupt
# during epoch 1, so no completed training run is recorded here.
bert.train(
    X,
    y,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
)

Tokenizing data:   0%|          | 0/163121 [00:00<?, ?it/s]

Tokenizing data:   0%|          | 0/18125 [00:00<?, ?it/s]

Training steps: 20388
Fitting model
Epoch 1/3



KeyboardInterrupt

