In [1]:
#Install torchvision
!pip install torchvision




You should consider upgrading via the 'c:\users\kids\appdata\local\programs\python\python38\python3.exe -m pip install --upgrade pip' command.


In [2]:
import torch


In [3]:
from typing import Callable, List, Optional, Tuple

import pandas as pd
from sklearn.base import TransformerMixin, BaseEstimator
import torch


class BertTransformer(BaseEstimator, TransformerMixin):
    def __init__(
            self,
            bert_tokenizer,
            bert_model,
            max_length: int = 60,
            embedding_func: Optional[Callable[[torch.tensor], torch.tensor]] = None,
    ):
        self.tokenizer = bert_tokenizer
        self.model = bert_model
        self.model.eval()
        self.max_length = max_length
        self.embedding_func = embedding_func

        if self.embedding_func is None:
            self.embedding_func = lambda x: x[0][:, 0, :].squeeze()

    def _tokenize(self, text: str) -> Tuple[torch.tensor, torch.tensor]:
        # Tokenize the text with the provided tokenizer
        tokenized_text = self.tokenizer.encode_plus(text,
                                                    add_special_tokens=True,
                                                    max_length=self.max_length
                                                    )["input_ids"]

        # Create an attention mask telling BERT to use all words
        attention_mask = [1] * len(tokenized_text)

        # bert takes in a batch so we need to unsqueeze the rows
        return (
            torch.tensor(tokenized_text).unsqueeze(0),
            torch.tensor(attention_mask).unsqueeze(0),
        )

    def _tokenize_and_predict(self, text: str) -> torch.tensor:
        tokenized, attention_mask = self._tokenize(text)

        embeddings = self.model(tokenized, attention_mask)
        return self.embedding_func(embeddings)

    def transform(self, text: List[str]):
        if isinstance(text, pd.Series):
            text = text.tolist()

        with torch.no_grad():
            return torch.stack([self._tokenize_and_predict(string) for string in text])

    def fit(self, X, y=None):
        """No fitting necessary so we just return ourselves"""
        return self

In [4]:
!pip install transformers==3.0.0

Collecting transformers==3.0.0
  Downloading transformers-3.0.0-py3-none-any.whl (754 kB)
Collecting tokenizers==0.8.0-rc4
  Downloading tokenizers-0.8.0rc4-cp38-cp38-win_amd64.whl (1.9 MB)
Collecting sentencepiece
  Downloading sentencepiece-0.1.95-cp38-cp38-win_amd64.whl (1.2 MB)


You should consider upgrading via the 'c:\users\kids\appdata\local\programs\python\python38\python3.exe -m pip install --upgrade pip' command.


Installing collected packages: tokenizers, sentencepiece, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.10.1
    Uninstalling tokenizers-0.10.1:
      Successfully uninstalled tokenizers-0.10.1
  Attempting uninstall: transformers
    Found existing installation: transformers 4.4.2
    Uninstalling transformers-4.4.2:
      Successfully uninstalled transformers-4.4.2
Successfully installed sentencepiece-0.1.95 tokenizers-0.8.0rc4 transformers-3.0.0


In [5]:
from transformers import BertTokenizer, BertModel
import torch
bert_model = BertModel.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=433.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=440473133.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=231508.0), HTML(value='')))




In [6]:
bert_transformer = BertTransformer(tokenizer, bert_model)
from sklearn.pipeline import Pipeline


In [7]:
url="https://raw.githubusercontent.com/ASIF-Mahmud1/Exploration/diminishingTerms/DiminishingTerms/dataSet.csv"
from io import StringIO
import string
import pandas as pd
import requests
s=requests.get(url).text

message_data=pd.read_csv(StringIO(s))
message_data['tag'] = message_data['tag'].str.strip()

message_data.head()

Unnamed: 0,id,tag,sentence
0,1,diminishing,I�m no expert but �
1,2,strong,This is what I see
2,3,diminishing,It�s just my opinion �
3,4,strong,It�s my opinion�
4,5,diminishing,Just checking in �


In [8]:
message_data_copy = message_data['sentence'].copy() 

message_data_copy

0                        I�m no expert but �
1                         This is what I see
2                     It�s just my opinion �
3                           It�s my opinion�
4                         Just checking in �
                       ...                  
71               Sorry for the inconvinience
72    Sorry about the dog in the background�
73                    Sorry if I look tired�
74                     Sorry, I didn�t know 
75             Thank you for the information
Name: sentence, Length: 76, dtype: object

In [9]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(multi_class='multinomial', solver='lbfgs',C=5.2)


In [12]:
pipeline = Pipeline(steps=  [
        ("vectorizer", bert_transformer),
        ("classifier", classifier),
    ] )

pipeline.fit(message_data_copy,  message_data['tag'])

Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'only_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you may want to check this is the right behavior.
Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'only_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you may want to check this is the right behavior.
Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'only_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you may want to check this is the right behavior.
Truncation was n

Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'only_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you may want to check this is the right behavior.
Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'only_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you may want to check this is the right behavior.
Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'only_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you may want to check this is the right behavior.
Truncation was n

Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'only_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you may want to check this is the right behavior.
Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'only_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you may want to check this is the right behavior.
Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'only_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you may want to check this is the right behavior.
Truncation was n

Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'only_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you may want to check this is the right behavior.


AttributeError: 'str' object has no attribute 'decode'

In [18]:
text = ["Do it now", "Sorry, I got it wrong", "Please Let me  go", "right now!"]
# predict the label using the pipeline
# pipeline.predict(text)
pipeline



Pipeline(steps=[('vectorizer',
                 BertTransformer(bert_model=None, bert_tokenizer=None,
                                 embedding_func=<function BertTransformer.__init__.<locals>.<lambda> at 0x0000000034FC51F0>)),
                ('classifier',
                 LogisticRegression(C=5.2, multi_class='multinomial'))])

In [19]:
print("Classes in order ",pipeline.classes_)

Classes in order  ['diminishing' 'strong']


In [21]:
# prob_Of_Each_Class = pipeline.predict_proba(text)
#print('Predicted Probabilities: %s' % prob_Of_Each_Class)

Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'only_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you may want to check this is the right behavior.
Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'only_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you may want to check this is the right behavior.
Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'only_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you may want to check this is the right behavior.
Truncation was n

AttributeError: 'list' object has no attribute 'shape'

## Connect To Watson

In [None]:
!pip install watson-machine-learning-client-V4


In [None]:
api_key = 'lrVZNLXATm9qTRWMsIis2ulpWa_irVtMI7Rb8zos74b-'
location = 'us-south'

wml_credentials = {
    "apikey": api_key,
    "url": 'https://' + location + '.ml.cloud.ibm.com'
}

In [None]:
from ibm_watson_machine_learning import APIClient
wml_client = APIClient(wml_credentials)

In [None]:
space_id = '448b44c1-e98d-4847-9867-c1f30704bd93'
client.set.default_space(space_id)

sofware_spec_uid = client.software_specifications.get_id_by_name("default_py3.7")

metadata = {
            client.repository.ModelMetaNames.NAME: 'Multinomial Regression model Moment of truth',
            client.repository.ModelMetaNames.TYPE: 'scikit-learn_0.23',
            client.repository.ModelMetaNames.SOFTWARE_SPEC_UID: sofware_spec_uid
}

In [None]:
published_model = client.repository.store_model(
    model=pipeline,
    meta_props=metadata)

In [11]:
client.repository.list()

NameError: name 'client' is not defined