In [31]:
# !pip install transformers sentence-transformers openai
# !pip install -U datasets

In [2]:
from datasets import load_dataset

# Load our data: the rotten_tomatoes dataset, which comes with train,
# validation and test splits holding a 'text' and a 'label' column each
data = load_dataset("rotten_tomatoes")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Show the available splits together with their columns and row counts
data

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
})

In [4]:
# Peek at the first and the last training example (text plus label)
data["train"][0, -1]

{'text': ['the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .',
  'things really get weird , though not particularly scary : the movie is all portent and no content .'],
 'label': [1, 0]}

## Text Classification with Representation Models

### Using a Task-specific Model

In [5]:
from transformers import pipeline

# Path to our HF model
model_path = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# Load model into pipeline.
# return_all_scores=True is requested so that each prediction exposes a score
# for every class; the prediction loop further down reads those scores by position.
# NOTE(review): newer transformers releases reportedly deprecate
# return_all_scores in favour of top_k=None, which may order the scores
# differently — confirm the ordering before switching.
pipe = pipeline(
    model=model_path,
    tokenizer=model_path,
    return_all_scores=True,
    # device="cuda:0"
    device="cpu"
)

W0823 22:08:21.512000 17092 site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.
Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
import numpy as np
from tqdm import tqdm
from transformers.pipelines.pt_utils import KeyDataset

# Run the sentiment pipeline over every test review and reduce each result to
# a binary label. Positions 0 and 2 of a result are read as the negative and
# positive scores; whatever sits at position 1 is ignored.
# NOTE(review): the position-to-class mapping is assumed, not checked — verify
# against the model's label configuration.
y_pred = [
    np.argmax([scores[0]["score"], scores[2]["score"]])
    for scores in tqdm(
        pipe(KeyDataset(data["test"], "text")),
        total=len(data["test"]),
    )
]

100%|██████████| 1066/1066 [01:05<00:00, 16.34it/s]


In [7]:
from sklearn.metrics import classification_report

def evaluate(y_true, y_pred):
    """Print a classification report for binary review labels.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    The two classes are reported as "Negative Review" and "Positive Review",
    in that order. Nothing is returned; the report is written to stdout.
    """
    class_names = ["Negative Review", "Positive Review"]
    print(classification_report(y_true, y_pred, target_names=class_names))

In [8]:
# Score the task-specific model's predictions against the test labels
evaluate(data["test"]["label"], y_pred)

                 precision    recall  f1-score   support

Negative Review       0.76      0.88      0.81       533
Positive Review       0.86      0.72      0.78       533

       accuracy                           0.80      1066
      macro avg       0.81      0.80      0.80      1066
   weighted avg       0.81      0.80      0.80      1066



## Classification Tasks that Leverage Embeddings

### Supervised Classification

In [10]:
from sentence_transformers import SentenceTransformer

# Load model (the same embedding model is reused later for the label embeddings)
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# Convert text to embeddings, one vector per review.
# This is the slow step: in the recorded CPU run the training split alone took
# roughly 43 minutes.
# NOTE(review): the [:] slices presumably materialise each text column as a
# plain list before encoding — confirm against the installed datasets version
# before removing them.
train_embeddings = model.encode(data["train"]["text"][:], show_progress_bar=True)
test_embeddings = model.encode(data["test"]["text"][:], show_progress_bar=True)

Batches: 100%|██████████| 267/267 [42:47<00:00,  9.62s/it]    
Batches: 100%|██████████| 34/34 [01:20<00:00,  2.38s/it]


In [11]:
# Sanity check: one row per training review, one column per embedding dimension
train_embeddings.shape

(8530, 768)

In [12]:
from sklearn.linear_model import LogisticRegression

# Train a logistic regression on the training embeddings, using the dataset
# labels as targets; the fixed random_state keeps the run repeatable
clf = LogisticRegression(random_state=42)
clf.fit(train_embeddings, data["train"]["label"])

In [13]:
# Predict labels for the test embeddings and score them against the test labels
y_pred = clf.predict(test_embeddings)
evaluate(data["test"]["label"], y_pred)

                 precision    recall  f1-score   support

Negative Review       0.85      0.86      0.85       533
Positive Review       0.86      0.85      0.85       533

       accuracy                           0.85      1066
      macro avg       0.85      0.85      0.85      1066
   weighted avg       0.85      0.85      0.85      1066



**Tip!**

What would happen if we did not use a classifier at all? Instead, we can average the embeddings per class and apply cosine similarity to predict which class matches each document best:

In [14]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import cosine_similarity

# Average Embeddings of all documents in each target label.
# The labels are appended as one extra column right after the embedding
# dimensions, so that column's position equals the embedding width; deriving it
# from the array keeps this working for models whose vectors are not 768 wide.
label_column = train_embeddings.shape[1]
df = pd.DataFrame(np.hstack([train_embeddings, np.array(data["train"]["label"]).reshape(-1, 1)]))
averaged_target_embeddings = df.groupby(label_column).mean().values

# For every evaluation document, pick the class whose averaged embedding is most similar
similarity = cosine_similarity(test_embeddings, averaged_target_embeddings)
y_pred = np.argmax(similarity, axis=1)

# Evaluate the model
evaluate(data["test"]["label"], y_pred)

                 precision    recall  f1-score   support

Negative Review       0.85      0.84      0.84       533
Positive Review       0.84      0.85      0.84       533

       accuracy                           0.84      1066
      macro avg       0.84      0.84      0.84      1066
   weighted avg       0.84      0.84      0.84      1066



### Zero-shot Classification

In [15]:
# Create embeddings for our labels.
# Order matters: position 0 describes the negative class and position 1 the
# positive class, matching how the predictions are later compared with the labels.
label_embeddings = model.encode(["A negative review",  "A positive review"])

In [16]:
from sklearn.metrics.pairwise import cosine_similarity

# Find the best matching label for each document: compare every test embedding
# with both label embeddings and keep the index of the more similar one
sim_matrix = cosine_similarity(test_embeddings, label_embeddings)
y_pred = np.argmax(sim_matrix, axis=1)

In [17]:
# Score the zero-shot predictions against the test labels
evaluate(data["test"]["label"], y_pred)

                 precision    recall  f1-score   support

Negative Review       0.78      0.77      0.78       533
Positive Review       0.77      0.79      0.78       533

       accuracy                           0.78      1066
      macro avg       0.78      0.78      0.78      1066
   weighted avg       0.78      0.78      0.78      1066



**Tip!**

What would happen if you were to use different descriptions? Use "A very negative movie review" and "A very positive movie review" to see what happens!

## Classification with Generative Models

### Encoder-decoder Models

In [19]:
# Load Flan-T5 (small) as a text-to-text generation pipeline.
# Note: this rebinds `pipe`, which until now held the sentiment-classification
# pipeline, so cells that use `pipe` depend on which of the two ran last.
pipe = pipeline(
    "text2text-generation",
    model="google/flan-t5-small",
    # device="cuda:0"
)

In [20]:
# Prefix every review with the instruction and store the result in a new "t5"
# column. `data` is rebound to the mapped dataset, so all splits gain the column.
prompt = "Is the following sentence positive or negative? "
data = data.map(lambda example: {"t5": prompt + example['text']})

Map: 100%|██████████| 8530/8530 [00:00<00:00, 11549.98 examples/s]
Map: 100%|██████████| 1066/1066 [00:00<00:00, 7876.96 examples/s]
Map: 100%|██████████| 1066/1066 [00:00<00:00, 9670.71 examples/s]


In [21]:
# Confirm that every split now carries the additional 't5' column
data

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 't5'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['text', 'label', 't5'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['text', 'label', 't5'],
        num_rows: 1066
    })
})

In [22]:
# Run the T5 prompts through the pipeline and map the generated word to a label.
# The answer is normalised (surrounding whitespace, letter case) before the
# comparison so that variants such as "Negative" are not silently counted as
# positive; anything that is not "negative" still maps to 1.
y_pred = []
for output in tqdm(pipe(KeyDataset(data["test"], "t5")), total=len(data["test"])):
    text = output[0]["generated_text"].strip().lower()
    y_pred.append(0 if text == "negative" else 1)

100%|██████████| 1066/1066 [02:01<00:00,  8.80it/s]


In [23]:
# Score the Flan-T5 predictions against the test labels
evaluate(data["test"]["label"], y_pred)

                 precision    recall  f1-score   support

Negative Review       0.83      0.85      0.84       533
Positive Review       0.85      0.83      0.84       533

       accuracy                           0.84      1066
      macro avg       0.84      0.84      0.84      1066
   weighted avg       0.84      0.84      0.84      1066



### ChatGPT for Classification

GPT model from the OpenAI platform

In [None]:
import os

import openai

# Create client.
# The API key is read from the OPENAI_API_KEY environment variable so that no
# secret is stored in the notebook, neither in code nor in comments.
# NOTE(review): a key that looked real used to be pasted at the top of this
# cell; treat it as leaked and revoke it in the OpenAI dashboard.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the OPENAI_API_KEY environment variable before running this cell."
    )
client = openai.OpenAI(api_key=api_key)

In [None]:
def chatgpt_generation(prompt, document, model="gpt-3.5-turbo-0125"):
    """Fill a prompt template with a document and return the chat model's reply.

    Args:
        prompt: Template text; every "[DOCUMENT]" placeholder is substituted.
        document: Text inserted in place of the placeholder.
        model: Name of the chat model to query.

    Returns:
        The message content of the first choice in the response.
    """
    system_turn = {"role": "system", "content": "You are a helpful assistant."}
    user_turn = {"role": "user", "content": prompt.replace("[DOCUMENT]", document)}

    # Uses the module-level `client`; the request is sent with temperature=0.
    completion = client.chat.completions.create(
        model=model,
        temperature=0,
        messages=[system_turn, user_turn],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [None]:
# Define a prompt template as a base; [DOCUMENT] is the placeholder that
# chatgpt_generation replaces with the review text.
# Note: this rebinds `prompt`, which earlier held the Flan-T5 instruction.
prompt = """Predict whether the following document is a positive or negative movie review:

[DOCUMENT]

If it is positive return 1 and if it is negative return 0. Do not give any other answers.
"""

# Predict the target using GPT (single example as a smoke test before the full run)
document = "unpretentious , charming , quirky , original"
chatgpt_generation(prompt, document)

The next step would be to run one of OpenAI's models against the entire evaluation dataset. However, only run this when you have sufficient tokens, as this will call the API for the entire test dataset (1066 records).

In [None]:
# You can skip this if you want to save your (free) credits.
# Issues one API call per test review, one after another.
predictions = [chatgpt_generation(prompt, doc) for doc in tqdm(data["test"]["text"])]

In [None]:
import re

# Extract predictions.
# The model is asked to answer with a bare 0 or 1, but replies such as "1." or
# "Answer: 0" would make a plain int() call fail, so pull out the first
# standalone 0/1 and name the offending reply when there is none.
y_pred = []
for idx, pred in enumerate(predictions):
    match = re.search(r"\b[01]\b", str(pred))
    if match is None:
        raise ValueError(f"Cannot parse a 0/1 label from prediction {idx}: {pred!r}")
    y_pred.append(int(match.group()))

# Evaluate performance
evaluate(data["test"]["label"], y_pred)