<a href="https://colab.research.google.com/github/ASIF-Mahmud1/Exploration/blob/text-classifier/NLP_Lessons/BERT/EnronDatasetDiminsihingTerms_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#Install torchvision
!pip install torchvision




In [2]:
import torch

In [3]:
from typing import Callable, List, Optional, Tuple

import pandas as pd
from sklearn.base import TransformerMixin, BaseEstimator
import torch


class BertTransformer(BaseEstimator, TransformerMixin):
    def __init__(
            self,
            bert_tokenizer,
            bert_model,
            max_length: int = 60,
            embedding_func: Optional[Callable[[torch.tensor], torch.tensor]] = None,
    ):
        self.tokenizer = bert_tokenizer
        self.model = bert_model
        self.model.eval()
        self.max_length = max_length
        self.embedding_func = embedding_func

        if self.embedding_func is None:
            self.embedding_func = lambda x: x[0][:, 0, :].squeeze()

    def _tokenize(self, text: str) -> Tuple[torch.tensor, torch.tensor]:
        # Tokenize the text with the provided tokenizer
        tokenized_text = self.tokenizer.encode_plus(text,
                                                    add_special_tokens=True,
                                                    max_length=self.max_length
                                                    )["input_ids"]

        # Create an attention mask telling BERT to use all words
        attention_mask = [1] * len(tokenized_text)

        # bert takes in a batch so we need to unsqueeze the rows
        return (
            torch.tensor(tokenized_text).unsqueeze(0),
            torch.tensor(attention_mask).unsqueeze(0),
        )

    def _tokenize_and_predict(self, text: str) -> torch.tensor:
        tokenized, attention_mask = self._tokenize(text)

        embeddings = self.model(tokenized, attention_mask)
        return self.embedding_func(embeddings)

    def transform(self, text: List[str]):
        if isinstance(text, pd.Series):
            text = text.tolist()

        with torch.no_grad():
            return torch.stack([self._tokenize_and_predict(string) for string in text])

    def fit(self, X, y=None):
        """No fitting necessary so we just return ourselves"""
        return self

In [4]:
!pip install transformers


Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/81/91/61d69d58a1af1bd81d9ca9d62c90a6de3ab80d77f27c5df65d9a2c1f5626/transformers-4.5.0-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.2MB 21.0MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 42.8MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/08/cd/342e584ee544d044fb573ae697404ce22ede086c9e87ce5960772084cad0/sacremoses-0.0.44.tar.gz (862kB)
[K     |████████████████████████████████| 870kB 46.1MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.44-cp37-none-any.whl size=886084 sha256=082ea97dc6

In [5]:
from transformers import BertTokenizer, BertModel
import torch
bert_model = BertModel.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




In [6]:
bert_transformer = BertTransformer(tokenizer, bert_model)
from sklearn.pipeline import Pipeline


In [7]:
url="https://raw.githubusercontent.com/ASIF-Mahmud1/Exploration/text-classifier/DiminishingTerms/dataSet.csv"
from io import StringIO
import string
import pandas as pd
import requests
s=requests.get(url).text

message_data=pd.read_csv(StringIO(s))
message_data['tag'] = message_data['tag'].str.strip()

message_data.head()
message_data['tag'].describe()

count             315
unique              2
top       diminishing
freq              234
Name: tag, dtype: object

In [8]:
message_data_copy = message_data['sentence'].copy() 

# message_data_copy

In [9]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(multi_class='multinomial', solver='lbfgs',C=5.2)


In [10]:
pipeline = Pipeline(steps=  [
        ("vectorizer", bert_transformer),
        ("classifier", classifier),
    ] )

pipeline.fit(message_data_copy,  message_data['tag'])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Pipeline(memory=None,
         steps=[('vectorizer',
                 BertTransformer(bert_model=None, bert_tokenizer=None,
                                 embedding_func=<function BertTransformer.__init__.<locals>.<lambda> at 0x7f4601b43050>,
                                 max_length=60)),
                ('classifier',
                 LogisticRegression(C=5.2, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='multinomial', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='lbfgs', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

In [44]:
import pandas as pd
df = pd.read_csv('https://query.data.world/s/yqgskd2yg2lztwtgmpeajy75msskd6',error_bad_lines=False, index_col=False, dtype='unicode')
df = df.applymap(str) ## stringify all fields
# df['content']

In [41]:
collectionSize=10
startIndex=0
endIndex=startIndex + collectionSize

# Test Sentences Here
### **Note**: Click Play to Predict next 10 emails

In [43]:
####################################################################################################################
                       ## Remove this later
import pandas as pd
import altair as alt
import numpy as np
text= df['content']
text= text[startIndex:endIndex]
prob_Of_Each_Class = pipeline.predict_proba(text)
X = np.arange(startIndex,endIndex)
y_strong= prob_Of_Each_Class[:,1]*100
y_diminishing=prob_Of_Each_Class[:,0]*100    

dfTest = pd.DataFrame(columns=('index', 'strong', 'diminish','sentence'))
dfTest['index']=X 
dfTest['strong']=y_strong 
dfTest['diminish']=y_diminishing
dfTest['sentence']=list(text)
gp_chart = (
  alt.Chart(dfTest,width=50 ,padding={"left": 50, "top": 5, "right": 5, "bottom": 5} )
    .transform_fold(["diminish", "strong"], as_=["key", "value"])
    .mark_bar()  # size=50
    .encode(
        x="key:N",
        y="value:Q",
        color="key:N",
        column="index",
        tooltip="sentence"
    )   # .interactive() 
)

startIndex= startIndex+collectionSize
endIndex=startIndex+collectionSize
gp_chart.display()
# predict the label using the pipeline
for row in text.index:
    print(row, " "+text[row])
                               ## Remove this later
####################################################################################################################

10  Mr. Buckner, For delivered gas behind San Diego, Enron Energy Services is the appropriate Enron entity. I have forwarded your request to Zarin Imam at EES. Her phone number is 713-853-7107. Phillip Allen
11  Lucy, Here are the rentrolls: Open them and save in the rentroll folder. Follow these steps so you don't misplace these files. 1. Click on Save As 2. Click on the drop down triangle under Save in: 3. Click on the (C): drive 4. Click on the appropriate folder 5. Click on Save: Phillip
12  ---------------------- Forwarded by Phillip K Allen/HOU/ECT on 10/09/2000 02:16 PM --------------------------- Richard Burchfield 10/06/2000 06:59 AM To: Phillip K Allen/HOU/ECT@ECT cc: Beth Perlman/HOU/ECT@ECT Subject: Consolidated positions: Issues & To Do list Phillip, Below is the issues & to do list as we go forward with documenting the requirements for consolidated physical/financial positions and transport trade capture. What we need to focus on is the first bullet in Allan's list; the n