\# 1. Using Hugging Face, create a zero-shot classifier

In [69]:
try:
    from transformers import pipeline
except:
    !pip install transformers
try:
    from datasets import load_dataset, list_datasets
except:
    !pip install datasets
try:
    import evaluate
except:
    !pip install evaluate

import numpy as np
import pandas as pd

In [70]:
from datasets import list_datasets
from datasets import load_dataset

In [71]:
[w for w in list_datasets() if w.startswith('tw')]

['tweet_eval',
 'tweet_qa',
 'tweets_ar_en_parallel',
 'tweets_hate_speech_detection',
 'twi_text_c3',
 'twi_wordsim353',
 'twnlp/mydataset',
 'twigwam/fuego-20230217-163523-5ea371',
 'twinkle555/manaV1.0',
 'twielema/50EntericDiseaseArticleQADataset',
 'tw0fold/behboud']

In [72]:
tweet_eval = load_dataset('tweet_eval', 'emoji')
tweet_eval



  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 45000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 5000
    })
})

In [73]:
tweet_eval['train'][0]

{'text': 'Sunday afternoon walking through Venice in the sun with @user ️ ️ ️ @ Abbot Kinney, Venice',
 'label': 12}

In [74]:
from transformers import pipeline

In [75]:
classifier = pipeline('zero-shot-classification')

No model was supplied, defaulted to facebook/bart-large-mnli and revision c626438 (https://huggingface.co/facebook/bart-large-mnli).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [76]:
classifier(tweet_eval['train'][0]['text'], ['vacation', 'anger', 'movies', 'politics'])

{'sequence': 'Sunday afternoon walking through Venice in the sun with @user ️ ️ ️ @ Abbot Kinney, Venice',
 'labels': ['vacation', 'movies', 'politics', 'anger'],
 'scores': [0.949162483215332,
  0.02893678843975067,
  0.011978179216384888,
  0.00992254912853241]}

In [77]:
classifier('I walked into my hotel room and wondered if the interior decorators '
           'thought orange was the new black.', ['true', 'false', 'sarcasm'])

{'sequence': 'I walked into my hotel room and wondered if the interior decorators thought orange was the new black.',
 'labels': ['sarcasm', 'true', 'false'],
 'scores': [0.7110901474952698, 0.2111695259809494, 0.07774034887552261]}

Explain with your own words why zero-shot classification words. Look at how BERT is trained on two sentences and the embedding characteristics of BERT's `[CLS]` token to form a hypothesis.

Now, load the transcripts of House, MD, and for answer what's the percentage of the sentences House speaks in all seasons are sarcastic.

https://www.kaggle.com/datasets/kunalbhar/house-md-transcripts?select=season2.csv

In [78]:
from google.colab import drive


In [82]:
with open('/content/drive/MyDrive/ELEN523/Lab 9/House MD/season1.csv', 'rb') as f:
    contents = f.read()


contents_decoded = contents.decode('ISO-8859-1')


from io import StringIO
house_md = pd.read_csv(StringIO(contents_decoded))

In [80]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [83]:
sarcasm_model = pipeline('zero-shot-classification')

No model was supplied, defaulted to facebook/bart-large-mnli and revision c626438 (https://huggingface.co/facebook/bart-large-mnli).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [84]:
# Counters
total_sentences = 0
sarcastic_sentences = 0

In [85]:
print(house_md.head())


      name                                               line
0  Melanie                                  Why are you late?
1  Rebecca               You're not going to like the answer.
2  Melanie                         I already know the answer.
3  Rebecca                                  I missed the bus.
4  Melanie   I don't doubt it, no bus stops near Brad's. Y...


In [None]:
# 'dialogue' is the column that contains the sentences spoken by House
for index, row in house_md.iterrows():
    sentence = row['line']
    # Classify the sentence
    result = sarcasm_model(sentence, ['sarcasm', 'not sarcasm'])
    total_sentences += 1
    # If the sentence is classified as 'sarcasm'
    if result['labels'][0] == 'sarcasm':
        sarcastic_sentences += 1

In [None]:
print(f"Percentage of sentences that are sarcastic:{100*sarcastic_sentences / total_sentences: .4f}%")

# 2. Performance a Sentence Classifier from DistillBERT

In [None]:
# code portions from https://huggingface.co/docs/transformers/tasks/sequence_classification

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

In [None]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [None]:
tokenized_tweet = tweet_eval.map(preprocess_function, batched=True)



Map:   0%|          | 0/50000 [00:00<?, ? examples/s]



In [None]:
from transformers import DataCollatorWithPadding

In [None]:
import evaluate

In [None]:
accuracy = evaluate.load("accuracy")

In [None]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [None]:
tweet_eval['train'].features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['❤', '😍', '😂', '💕', '🔥', '😊', '😎', '✨', '💙', '😘', '📷', '🇺🇸', '☀', '💜', '😉', '💯', '😁', '🎄', '📸', '😜'], id=None)}

In [None]:
id2label = {n:tweet_eval['train'].features['label'].names[n]
            for n in range(tweet_eval['train'].features['label'].num_classes)}
label2id = {c:n for n, c in id2label.items()}
label2id

{'❤': 0,
 '😍': 1,
 '😂': 2,
 '💕': 3,
 '🔥': 4,
 '😊': 5,
 '😎': 6,
 '✨': 7,
 '💙': 8,
 '😘': 9,
 '📷': 10,
 '🇺🇸': 11,
 '☀': 12,
 '💜': 13,
 '😉': 14,
 '💯': 15,
 '😁': 16,
 '🎄': 17,
 '📸': 18,
 '😜': 19}

In [None]:
from transformers import DataCollatorWithPadding

In [None]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=len(id2label), id2label=id2label, label2id=label2id
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'pre_classifier.bias', 'classifier.

In [None]:
pip install transformers[torch]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install accelerate -U

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
training_args = TrainingArguments(
    output_dir="my_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)


In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_tweet["train"],
    eval_dataset=tokenized_tweet["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


Now, you will learn how to create your own head to train this classifier.

In [None]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [None]:
device = 'cpu'

In [None]:
from copy import deepcopy
import torch.nn as nn

In [None]:
cp_model = deepcopy(model)
cp_model.classifier = nn.Sequential(
    (nn.Linear(768, 526)),
    nn.Dropout(0.1),
    nn.Dropout(0.1),
    (nn.Linear(526, 258)),
    nn.ReLU(),
    nn.Dropout(0.1),
    (nn.Linear(258, 2)),
    nn.Softmax()
)
cp_model = cp_model.to(device)
cp_model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [None]:
training_args = TrainingArguments(
    output_dir="cp_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_tweet["train"],
    eval_dataset=tokenized_tweet["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,2.2137,2.179245,0.34786
