In [None]:
#from google.colab import userdata
#userdata.get('hugging_face_llms')

In [None]:
pip install -U datasets

In [None]:
from datasets import disable_caching
# Turn off caching in the `datasets` library before any dataset is loaded or mapped.
# NOTE(review): presumably so later `.map` results are recomputed rather than
# reused from cache files — confirm against the `datasets` documentation.
disable_caching()

In [None]:
from datasets import load_dataset

In [None]:
# Load the "rotten_tomatoes" dataset, pinned to the "main" revision,
# and display the loaded object as the cell output.
data = load_dataset("rotten_tomatoes", revision="main")
data

In [None]:
from transformers import pipeline

In [None]:
import torch

model_path = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# Load model into pipeline.
# The first GPU is used when CUDA is available; otherwise the pipeline runs on
# the CPU so the notebook still works on machines without a GPU.
# NOTE(review): `return_all_scores=True` is kept deliberately. The inference
# loop below reads scores by position (index 0 and index 2); the newer
# `top_k=None` option is understood to return scores sorted by value instead,
# which would break that positional indexing — confirm before switching.
pipe = pipeline(
    model=model_path,
    tokenizer=model_path,
    return_all_scores=True,
    device="cuda:0" if torch.cuda.is_available() else "cpu",
)

In [None]:
import numpy as np
from tqdm import tqdm
from transformers.pipelines.pt_utils import KeyDataset

In [None]:
# Run inference on the test split's "text" column and reduce every result to
# a binary label.
#
# For each document the pipeline yields a list of per-class score dicts.
# Entry 0 is treated as the negative score and entry 2 as the positive score;
# entry 1 is ignored. `np.argmax` over [negative, positive] therefore gives
# 0 when the negative score is at least as large, and 1 otherwise.
y_pred = [
    np.argmax([scores[0]["score"], scores[2]["score"]])
    for scores in tqdm(pipe(KeyDataset(data["test"], "text")), total=len(data["test"]))
]

In [None]:
from sklearn.metrics import classification_report

def evaluate_performance(y_true, y_pred):
    """Print a classification report for the two review classes.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, in the same order as ``y_true``.

    Returns:
        None. The report text is written to standard output.
    """
    # Display names passed to the report, listed negative first.
    class_names = ["negative review", "positive review"]
    print(classification_report(y_true, y_pred, target_names=class_names))


In [None]:
# Print the report comparing the sentiment pipeline's predictions with the test labels.
evaluate_performance(data["test"]["label"], y_pred)

# Supervised Classification: Two-step classification with a feature extractor and a separate classifier

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
# Load the sentence-embedding model
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

# Convert text to embeddings.
# Each split's "text" column is read once and handed to `encode` as a plain
# list of strings. Indexing `data[split]["text"][i]` inside a comprehension
# would fetch the column again on every iteration for no benefit.
train_embeddings = model.encode(list(data["train"]["text"]), show_progress_bar=True)
test_embeddings = model.encode(list(data["test"]["text"]), show_progress_bar=True)

# Show the shape of the training embeddings as the cell output
train_embeddings.shape

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Train a logistic-regression classifier on the training embeddings, using the
# train split's "label" column as targets. `random_state` is fixed at 42.

clf = LogisticRegression(random_state=42)
clf.fit(train_embeddings, data["train"]["label"])

In [None]:
# Predict labels for the test embeddings (not used when fitting `clf`),
# then print the classification report against the test labels.

y_pred = clf.predict(test_embeddings)
evaluate_performance(data["test"]["label"], y_pred)

# Classification of Unlabeled Data

In [None]:
# Create embeddings for our labels with the same sentence-embedding model.
# The descriptions are listed negative first, positive second, matching the
# order of the class names used in `evaluate_performance`.
label_embeddings = model.encode(["A negative review", "A positive review"])

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Find the best matching label for each document:
# compute the cosine similarity between every test embedding and each label
# embedding, then take, per row, the column index with the highest similarity.
sim_matrix = cosine_similarity(test_embeddings, label_embeddings)
y_pred = np.argmax(sim_matrix, axis=1)

In [None]:
# Print the report comparing the label-similarity predictions with the test labels.
evaluate_performance(data["test"]["label"], y_pred)

In [None]:
# Load t5 model
import torch

# The first GPU is used when CUDA is available; otherwise the pipeline runs on
# the CPU so the notebook still works on machines without a GPU.
pipe = pipeline(
    "text2text-generation",
    model="google/flan-t5-base",
    device="cuda:0" if torch.cuda.is_available() else "cpu",
)

In [None]:
# Prepare our data

prompt = "Is the following sentence positive or negative?"
# Add a "t5" column holding the question followed by the review text.
# A space separates the two so the question mark is not fused with the first
# word of the review.
data = data.map(lambda example: {"t5": f"{prompt} {example['text']}"})
data

In [None]:
# Run inference on the "t5" prompts of the test split.
# A generation containing the substring "positive" is labelled 1; anything
# else is labelled 0.

y_pred = []
for generated in tqdm(pipe(KeyDataset(data["test"], "t5")), total=len(data["test"])):
    answer = generated[0]["generated_text"]
    y_pred.append(1 if "positive" in answer else 0)

In [None]:
# Print the report comparing the Flan-T5 predictions with the test labels.
evaluate_performance(data["test"]["label"], y_pred)

# Text Classification for Unlabeled Movie Reviews using ChatGPT