In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# 1. Paths
in_path = "Sample Customer Feedback_summarized.xlsx"  # your summarized file
out_path = "Sample Customer Feedback_with_sentiment_cardiff.xlsx"

# 2. Load data
df = pd.read_excel(in_path)
if "Comment" not in df.columns:
    # Fail here, before the model is loaded, instead of at the scoring step.
    raise KeyError(
        f"Expected a 'Comment' column in {in_path!r}; found {list(df.columns)}"
    )

# 3. Load Cardiff model
model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
sent_pipe = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

# 4. Apply sentiment on the Comment column
# Missing comments (NaN) must not go through astype(str): that would turn them
# into the literal text "nan" and give them a sentiment score. They are left
# unscored instead (label None, scores NaN).
has_comment = df["Comment"].notna().tolist()
texts = [str(comment) for comment, ok in zip(df["Comment"], has_comment) if ok]

# The whole list goes through the pipeline in a single call rather than one
# call per row.
#   top_k=None      -> every label's score is returned for each text
#                      (replaces the deprecated return_all_scores=True)
#   truncation=True -> over-long comments are cut to max_length tokens instead
#                      of raising inside the model
# Each item of all_results is a list like:
#   [{'label': 'negative', 'score': ...}, {'label': 'neutral', ...}, {'label': 'positive', ...}]
all_results = (
    sent_pipe(texts, top_k=None, truncation=True, max_length=512) if texts else []
)

labels = []
scores_pos = []
scores_neu = []
scores_neg = []

unscored = float("nan")
results_iter = iter(all_results)

for ok in has_comment:
    if not ok:
        labels.append(None)
        scores_neg.append(unscored)
        scores_neu.append(unscored)
        scores_pos.append(unscored)
        continue

    # Label -> score mapping for this comment
    score_map = {d["label"]: d["score"] for d in next(results_iter)}
    # Pick argmax label
    best_label = max(score_map, key=score_map.get)

    labels.append(best_label)
    scores_neg.append(score_map.get("negative", 0.0))
    scores_neu.append(score_map.get("neutral", 0.0))
    scores_pos.append(score_map.get("positive", 0.0))

df["sentiment_label_cardiff"] = labels
df["sentiment_negative_cardiff"] = scores_neg
df["sentiment_neutral_cardiff"] = scores_neu
df["sentiment_positive_cardiff"] = scores_pos

# 5. Save
df.to_excel(out_path, index=False)
print("Saved:", out_path)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


model.safetensors:   0%|          | 0.00/501M [00:00<?, ?B/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Saved: Sample Customer Feedback_with_sentiment_cardiff.xlsx


In [None]:
import time
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# 1. Paths
in_path = "Sample Customer Feedback_summarized.xlsx"
out_path = "Sample Customer Feedback_with_sentiment_cardiff_2.xlsx"

# 2. Load data
df = pd.read_excel(in_path)

# 3. Load Cardiff model
model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
sent_pipe = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

# -------------------------------
# TIMING STARTS HERE
# -------------------------------
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short durations; time.time() is wall-clock time and can jump.
start_total = time.perf_counter()

labels = []
scores_pos = []
scores_neu = []
scores_neg = []

# list to track each inference time (one entry per comment actually scored)
per_query_times = []

unscored = float("nan")

# Calls stay one-per-comment on purpose: this cell measures per-query latency.
for raw_comment in df["Comment"]:
    if pd.isna(raw_comment):
        # A missing comment would become the literal text "nan" under str();
        # leave it unscored and untimed instead of scoring that placeholder.
        labels.append(None)
        scores_neg.append(unscored)
        scores_neu.append(unscored)
        scores_pos.append(unscored)
        continue

    text = str(raw_comment)

    t0 = time.perf_counter()         # start timer for this single inference

    # top_k=None returns every label's score as a flat list of dicts for a
    # single string (replaces the deprecated return_all_scores=True, which
    # wrapped the result in an extra list). truncation/max_length keep
    # over-long comments from raising inside the model.
    all_scores = sent_pipe(text, top_k=None, truncation=True, max_length=512)

    t1 = time.perf_counter()         # end timer for this inference
    per_query_times.append(t1 - t0)

    score_map = {d["label"]: d["score"] for d in all_scores}
    best_label = max(all_scores, key=lambda d: d["score"])["label"]

    labels.append(best_label)
    scores_neg.append(score_map.get("negative", 0.0))
    scores_neu.append(score_map.get("neutral", 0.0))
    scores_pos.append(score_map.get("positive", 0.0))

# -------------------------------
# TIMING ENDS HERE
# -------------------------------
end_total = time.perf_counter()

total_time = end_total - start_total
if per_query_times:
    avg_time_per_query = sum(per_query_times) / len(per_query_times)
    max_time = max(per_query_times)
    min_time = min(per_query_times)
else:
    # Nothing was scored (empty sheet or all comments missing): report NaN
    # rather than failing on a division by zero / max() of an empty list.
    avg_time_per_query = max_time = min_time = float("nan")

# Add to dataframe
df["sentiment_label_cardiff"] = labels
df["sentiment_negative_cardiff"] = scores_neg
df["sentiment_neutral_cardiff"] = scores_neu
df["sentiment_positive_cardiff"] = scores_pos

# Save results
df.to_excel(out_path, index=False)

print("Saved:", out_path)
print("\n========= TIMING REPORT =========")
print(f"Total time for full dataset: {total_time:.3f} seconds")
print(f"Total samples: {len(df)}")
print(f"Average time per comment: {avg_time_per_query:.3f} seconds")
print(f"Fastest single inference: {min_time:.3f} seconds")
print(f"Slowest single inference: {max_time:.3f} seconds")

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


Saved: Sample Customer Feedback_with_sentiment_cardiff_2.xlsx

Total time for full dataset: 0.276 seconds
Total samples: 19
Average time per comment: 0.014 seconds
Fastest single inference: 0.011 seconds
Slowest single inference: 0.025 seconds


In [None]:
import time
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from transformers.utils import logging as hf_logging

import torch  # used only to detect whether a CUDA device is available

# OPTIONAL: suppress transformer warnings like the unused pooler weights
hf_logging.set_verbosity_error()

# 1. Paths
in_path = "Sample Customer Feedback_summarized.xlsx"   # or your raw CSV if you change this
out_path = "Sample Customer Feedback_with_sentiment_cardiff_fixed.xlsx"

# 2. Load data
df = pd.read_excel(in_path)  # for CSV: pd.read_csv(in_path, encoding="latin1")

# 3. Load Cardiff model
model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# device=0 for first GPU, device=-1 for CPU.
# Chosen at runtime so the cell also runs on a machine without a GPU;
# set `device = -1` by hand to force CPU.
device = 0 if torch.cuda.is_available() else -1
sent_pipe = pipeline(
    "sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
    device=device,
)

# 4. Apply sentiment on the Comment column (with timing)
labels = []
scores_pos = []
scores_neu = []
scores_neg = []
per_query_times = []  # one entry per comment actually scored

unscored = float("nan")

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short durations; time.time() is wall-clock time and can jump.
start_total = time.perf_counter()

# Calls stay one-per-comment on purpose: this cell measures per-query latency.
for raw_comment in df["Comment"]:
    if pd.isna(raw_comment):
        # A missing comment would become the literal text "nan" under str();
        # leave it unscored and untimed instead of scoring that placeholder.
        labels.append(None)
        scores_neg.append(unscored)
        scores_neu.append(unscored)
        scores_pos.append(unscored)
        continue

    text = str(raw_comment)

    t0 = time.perf_counter()

    # top_k=None -> returns *all* labels as a list of dicts
    # e.g. [{'label': 'negative', 'score': ...}, {'label': 'neutral', ...}, {'label': 'positive', ...}]
    # truncation/max_length keep over-long comments from raising inside the model.
    all_scores = sent_pipe(text, top_k=None, truncation=True, max_length=512)

    t1 = time.perf_counter()
    per_query_times.append(t1 - t0)

    # Build a label -> score mapping
    score_map = {d["label"]: d["score"] for d in all_scores}
    # Choose the label with highest score
    best_label = max(all_scores, key=lambda d: d["score"])["label"]

    labels.append(best_label)
    scores_neg.append(score_map.get("negative", 0.0))
    scores_neu.append(score_map.get("neutral", 0.0))
    scores_pos.append(score_map.get("positive", 0.0))

end_total = time.perf_counter()

total_time = end_total - start_total
if per_query_times:
    avg_time_per_query = sum(per_query_times) / len(per_query_times)
    max_time = max(per_query_times)
    min_time = min(per_query_times)
else:
    # Nothing was scored (empty sheet or all comments missing): report NaN
    # rather than failing on a division by zero / max() of an empty list.
    avg_time_per_query = max_time = min_time = float("nan")

# 5. Save sentiment columns into dataframe
df["sentiment_label_cardiff"] = labels
df["sentiment_negative_cardiff"] = scores_neg
df["sentiment_neutral_cardiff"] = scores_neu
df["sentiment_positive_cardiff"] = scores_pos

# 6. Save to Excel
df.to_excel(out_path, index=False)

print("Saved:", out_path)
print("\n========= TIMING REPORT =========")
print(f"Total time for full dataset: {total_time:.3f} seconds")
print(f"Total samples: {len(df)}")
print(f"Average time per comment: {avg_time_per_query:.3f} seconds")
print(f"Fastest single inference: {min_time:.3f} seconds")
print(f"Slowest single inference: {max_time:.3f} seconds")

Saved: Sample Customer Feedback_with_sentiment_cardiff_fixed.xlsx

Total time for full dataset: 0.295 seconds
Total samples: 19
Average time per comment: 0.015 seconds
Fastest single inference: 0.011 seconds
Slowest single inference: 0.021 seconds
