<a href="https://colab.research.google.com/github/ASIF-Mahmud1/Exploration/blob/text-classifier/NLP_Lessons/BERT/CreateTransformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
from typing import Callable, List, Optional, Tuple

import pandas as pd
from sklearn.base import TransformerMixin, BaseEstimator
import torch


class BertTransformer(BaseEstimator, TransformerMixin):
    def __init__(
            self,
            bert_tokenizer,
            bert_model,
            max_length: int = 60,
            embedding_func: Optional[Callable[[torch.tensor], torch.tensor]] = None,
    ):
        self.tokenizer = bert_tokenizer
        self.model = bert_model
        self.model.eval()
        self.max_length = max_length
        self.embedding_func = embedding_func

        if self.embedding_func is None:
            self.embedding_func = lambda x: x[0][:, 0, :].squeeze()

    def _tokenize(self, text: str) -> Tuple[torch.tensor, torch.tensor]:
        # Tokenize the text with the provided tokenizer
        tokenized_text = self.tokenizer.encode_plus(text,
                                                    add_special_tokens=True,
                                                    max_length=self.max_length
                                                    )["input_ids"]

        # Create an attention mask telling BERT to use all words
        attention_mask = [1] * len(tokenized_text)

        # bert takes in a batch so we need to unsqueeze the rows
        return (
            torch.tensor(tokenized_text).unsqueeze(0),
            torch.tensor(attention_mask).unsqueeze(0),
        )

    def _tokenize_and_predict(self, text: str) -> torch.tensor:
        tokenized, attention_mask = self._tokenize(text)

        embeddings = self.model(tokenized, attention_mask)
        return self.embedding_func(embeddings)

    def transform(self, text: List[str]):
        if isinstance(text, pd.Series):
            text = text.tolist()

        with torch.no_grad():
            return torch.stack([self._tokenize_and_predict(string) for string in text])

    def fit(self, X, y=None):
        """No fitting necessary so we just return ourselves"""
        return self

In [8]:
!pip install transformers




In [9]:
from transformers import BertTokenizer, BertModel
import torch
bert_model = BertModel.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


In [32]:
import pandas as pd
figure8_df= pd.read_csv('https://query.data.world/s/7njms3ftvoafvlmkenfd633lxe7wkm')
figure8_df= figure8_df[:1000]
figure8_df.size
figure8_df.head()
figure8_df[:1]['content']

0    @tiffanylue i know  i was listenin to bad habi...
Name: content, dtype: object

In [11]:
import numpy as np
split = np.random.choice(
    ["train", "val", "test"],
    size=figure8_df.shape[0],
    p=[.7, .15, .15]
)
figure8_df["split"] = split
x_train = figure8_df[figure8_df["split"] == "train"]
y_train = x_train["sentiment"]

In [12]:
bert_transformer = BertTransformer(tokenizer, bert_model)
from sklearn import svm
from sklearn.pipeline import Pipeline
classifier = svm.LinearSVC(C=1.0, class_weight="balanced")
model = Pipeline(
    [
        ("vectorizer", bert_transformer),
        ("classifier", classifier),
    ]
)
model.fit(x_train["content"], y_train)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Pipeline(memory=None,
         steps=[('vectorizer',
                 BertTransformer(bert_model=None, bert_tokenizer=None,
                                 embedding_func=<function BertTransformer.__init__.<locals>.<lambda> at 0x7f6437c42710>,
                                 max_length=60)),
                ('classifier',
                 LinearSVC(C=1.0, class_weight='balanced', dual=True,
                           fit_intercept=True, intercept_scaling=1,
                           loss='squared_hinge', max_iter=1000,
                           multi_class='ovr', penalty='l2', random_state=None,
                           tol=0.0001, verbose=0))],
         verbose=False)

In [37]:
text=["wants to hang out with friends SOON!","She is just happy to be here.","It is so annoying","I am so excited"]
model.predict(text)

array(['neutral', 'happiness', 'neutral', 'surprise'], dtype=object)