# About

Fine-tune transformer models using scikit-learn-like syntax.


> The code and output already exist; let's just run through this worked notebook.


https://github.com/IndicoDataSolutions/finetune

> NOTE:  Change runtime to GPU


Cleanup:

- Save out train/test data
- Show decision tree after finetune
- Show logistic regression - a transformer is not always needed; a simpler model means easier infra and faster runs. Don't assume the latest/greatest always applies to our problem!



In [None]:
# order of ops
# change to GPU
# install finetune
# restart runtime



# install -- per https://github.com/IndicoDataSolutions/finetune/issues/662
# tldr; Colab was choking
!pip install git+https://github.com/IndicoDataSolutions/finetune.git

> Restart the runtime

In [None]:
# imports
# https://github.com/IndicoDataSolutions/finetune
from finetune.base_models import BERT
from finetune import Classifier
import pandas as pd
from sklearn import metrics 

In [None]:
# Pull the whole topics table from BigQuery into a DataFrame,
# billing the query to the "questrom" project.
SQL = "SELECT * from `questrom.datasets.topics`"
df = pd.read_gbq(SQL, project_id="questrom")

In [None]:
# `topic` is the label we want to predict: show how many rows each class has,
# including any missing labels.
df["topic"].value_counts(dropna=False)

Product Specifications    839
Product Availability      833
Product Comparison        806
Shipping                  799
Returns & Refunds         768
Sales/Promotions          505
Omnichannel               450
Name: topic, dtype: int64

![](https://jalammar.github.io/images/bert-classifier.png)

Great resource: https://jalammar.github.io/illustrated-bert/

In [None]:
# Configure a finetune Classifier that uses BERT as its base model.
model = Classifier(
    base_model=BERT,
    low_memory_mode=True,
    n_epochs=3,
    batch_size=100,
)

Downloading: bert_small_cased-v2.jl


100%|██████████| 436M/436M [00:50<00:00, 8.59MiB/s]


Downloading: vocab.txt


100%|██████████| 213k/213k [00:00<00:00, 293kiB/s]
INFO:finetune:Saving tensorboard output to /tmp/Finetunefeufmmlc


In [None]:
# Eyeball three random rows to see the columns and typical text.
df.sample(n=3)

Unnamed: 0,id,text,topic
1501,3303,Do you have any first time shopper discounts I...,Sales/Promotions
2586,607,Id like to find a fragrance that is a little f...,Product Comparison
2653,953,I was looking at both the Frigidaire window ai...,Product Comparison


In [None]:
# Inputs for training: message text as a plain Python list,
# topic labels as a pandas Series.
X = df["text"].tolist()
y = df["topic"]

# Fine-tune the model. Every row of df is used here -- nothing is held out.
model.fit(X, y)

INFO:finetune: Visible GPUs: {GPU:/physical_device:GPU:0}
Epoch 3/3: 100%|██████████| 5000/5000 [01:15<00:00, 66.04it/s]


> The fine-tuning above took almost 4.5 minutes with a GPU.

In [None]:
# Let's make some predictions -- first, look at three random example messages.
df["text"].sample(n=3)

1105    Is there a Vitamin Shoppe close to Madison, Wi...
1176    If I order the Pelle Sconce, #8554961, but if ...
511     When it comes to the Photo Finish Lash Primer ...
Name: text, dtype: object

In [None]:
# Classify one hand-written message; predict takes a list of texts.
model.predict(
    ["If I order by tomorrow, will I get the stapler in time for the weekend?"]
)

  kernel_initializer=create_initializer(initializer_range),
  return layer.apply(inputs)
  kernel_initializer=create_initializer(initializer_range),
  kernel_initializer=create_initializer(initializer_range),
  kernel_initializer=create_initializer(initializer_range))
  kernel_initializer=create_initializer(initializer_range))
  kernel_initializer=create_initializer(initializer_range))
  kernel_initializer=create_initializer(config.initializer_range),
Inference: 1it [00:04,  4.45s/it]


['Shipping']

In [None]:
# Another hand-written message, passed as a one-element list.
model.predict(
    ["Is there a location near Boston?"]
)

  kernel_initializer=create_initializer(initializer_range),
  return layer.apply(inputs)
  kernel_initializer=create_initializer(initializer_range),
  kernel_initializer=create_initializer(initializer_range),
  kernel_initializer=create_initializer(initializer_range))
  kernel_initializer=create_initializer(initializer_range))
  kernel_initializer=create_initializer(initializer_range))
  kernel_initializer=create_initializer(config.initializer_range),
Inference: 1it [00:04,  4.53s/it]


['Omnichannel']

In [None]:
# Bulk prediction: grab ten random messages as a plain list.
msgs = df["text"].sample(n=10).tolist()

In [None]:
# Predict a topic for each of the sampled messages in a single call.
preds = model.predict(msgs)

  kernel_initializer=create_initializer(initializer_range),
  return layer.apply(inputs)
  kernel_initializer=create_initializer(initializer_range),
  kernel_initializer=create_initializer(initializer_range),
  kernel_initializer=create_initializer(initializer_range))
  kernel_initializer=create_initializer(initializer_range))
  kernel_initializer=create_initializer(initializer_range))
  kernel_initializer=create_initializer(config.initializer_range),
Inference: 10it [00:04,  2.39it/s]


In [None]:
# Display the predicted labels for the sampled messages.
preds

['Returns & Refunds',
 'Product Comparison',
 'Product Availability',
 'Returns & Refunds',
 'Sales/Promotions',
 'Sales/Promotions',
 'Omnichannel',
 'Omnichannel',
 'Product Specifications',
 'Returns & Refunds']

In [None]:
# Finally, predict for the full dataset -- about 30 seconds.
# This rebinds `preds`, replacing the predictions for the 10 sampled messages.
preds = model.predict(df["text"].tolist())

  kernel_initializer=create_initializer(initializer_range),
  return layer.apply(inputs)
  kernel_initializer=create_initializer(initializer_range),
  kernel_initializer=create_initializer(initializer_range),
  kernel_initializer=create_initializer(initializer_range))
  kernel_initializer=create_initializer(initializer_range))
  kernel_initializer=create_initializer(initializer_range))
  kernel_initializer=create_initializer(config.initializer_range),
Inference: 5000it [00:25, 199.20it/s]


In [None]:
# Keep the BERT predictions alongside the true labels in the frame.
df["pred"] = preds

In [None]:
# Per-topic precision / recall / F1 for the BERT predictions.
# NOTE(review): the rows scored here are the same rows passed to model.fit,
# so these are in-sample numbers, not held-out performance.
cr = metrics.classification_report(y_true=df["topic"], y_pred=preds)
print(cr)

                        precision    recall  f1-score   support

           Omnichannel       1.00      0.95      0.97       450
  Product Availability       0.93      0.98      0.95       833
    Product Comparison       0.99      1.00      0.99       806
Product Specifications       0.98      0.97      0.97       839
     Returns & Refunds       0.95      0.99      0.97       768
      Sales/Promotions       0.96      0.86      0.91       505
              Shipping       0.98      0.98      0.98       799

              accuracy                           0.97      5000
             macro avg       0.97      0.96      0.97      5000
          weighted avg       0.97      0.97      0.97      5000



# Compare to sklearn

In [None]:
from sklearn.pipeline import Pipeline 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

# Tree - Let's Get a Baseline

In [None]:
# Baseline pipeline: TF-IDF features feeding a decision tree whose growth is
# capped by the depth / leaf-size / split-size limits below.
steps = [
    ("vec", TfidfVectorizer()),
    (
        "clf",
        DecisionTreeClassifier(
            max_depth=10,
            min_samples_leaf=15,
            min_samples_split=30,
            # Fixed seed: without it, repeated fits can produce different trees,
            # so the baseline numbers would not be reproducible.
            random_state=42,
        ),
    ),
]

pipe_tree = Pipeline(steps)

In [None]:
# Fit the TF-IDF + tree pipeline on the same X / y that the BERT model was fit on.
pipe_tree.fit(X, y)

Pipeline(memory=None,
         steps=[('vec',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?...
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                        criterion='gini', max_depth=10,
                 

In [None]:
# Store the tree baseline's predictions for the same texts it was fit on.
df["preds3"] = pipe_tree.predict(X)

In [None]:
# Per-topic precision / recall / F1 for the decision-tree baseline.
# NOTE(review): preds3 comes from pipe_tree.predict(X) on the data the
# pipeline was fit on, so these are in-sample numbers as well.
cr3 = metrics.classification_report(y_true=df["topic"], y_pred=df["preds3"])
print(cr3)

                        precision    recall  f1-score   support

           Omnichannel       0.91      0.71      0.80       450
  Product Availability       0.77      0.88      0.82       833
    Product Comparison       0.96      0.85      0.90       806
Product Specifications       0.66      0.87      0.75       839
     Returns & Refunds       0.96      0.89      0.93       768
      Sales/Promotions       0.84      0.66      0.74       505
              Shipping       0.93      0.89      0.91       799

              accuracy                           0.84      5000
             macro avg       0.86      0.82      0.83      5000
          weighted avg       0.86      0.84      0.84      5000

