
# **Optimization**



In [None]:
pip install transformers datasets evaluate accelerate

## 1. Connect to Google Drive

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; later cells read and write under this path.
drive.mount('/content/drive')

Mounted at /content/drive


## 2. Import Necessary Libraries and Dataset

In [None]:
from transformers import pipeline, AutoTokenizer
from datasets import load_dataset, DatasetDict, Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import AutoTokenizer, AutoModelForSequenceClassification


# Load the 'yelp_review_full' dataset (the splits used below are 'train' and 'test').
ds = load_dataset('yelp_review_full')

In [None]:
# Peek at the first test example: a dict with an integer 'label' and the review 'text'.
ds['test'][0]

{'label': 0,
 'text': 'I got \'new\' tires from them and within two weeks got a flat. I took my car to a local mechanic to see if i could get the hole patched, but they said the reason I had a flat was because the previous patch had blown - WAIT, WHAT? I just got the tire and never needed to have it patched? This was supposed to be a new tire. \\nI took the tire over to Flynn\'s and they told me that someone punctured my tire, then tried to patch it. So there are resentful tire slashers? I find that very unlikely. After arguing with the guy and telling him that his logic was far fetched he said he\'d give me a new tire \\"this time\\". \\nI will never go back to Flynn\'s b/c of the way this guy treated me and the simple fact that they gave me a used tire!'}

### 2.1 ID Mapping with id2label and label2id

In [None]:
# Inspect the column schema to see which class names sit behind the integer labels.
ds['train'].features

{'label': ClassLabel(names=['1 star', '2 star', '3 stars', '4 stars', '5 stars'], id=None),
 'text': Value(dtype='string', id=None)}

In [None]:
# Star-rating names, ordered so that the list position equals the integer label id
class_labels = ['1 star', '2 star', '3 stars', '4 stars', '5 stars']

# Forward (id -> name) and reverse (name -> id) lookup tables
id2label = dict(enumerate(class_labels))
label2id = {name: idx for idx, name in id2label.items()}

# Show both mappings as a sanity check
print("id2label:", id2label)
print("label2id:", label2id)


id2label: {0: '1 star', 1: '2 star', 2: '3 stars', 3: '4 stars', 4: '5 stars'}
label2id: {'1 star': 0, '2 star': 1, '3 stars': 2, '4 stars': 3, '5 stars': 4}


## 3. Preprocess with Tokenizer

In [None]:
from transformers import pipeline, AutoTokenizer
# Load the tokenizer of the checkpoint that is fine-tuned later in this notebook.
tokenizer = AutoTokenizer.from_pretrained("juliensimon/reviews-sentiment-analysis")

In [None]:
def preprocess_function(examples):
    """Tokenize the ``text`` field of a batch with truncation enabled.

    No padding is requested here; batches are padded later by the data collator.
    """
    review_texts = examples["text"]
    return tokenizer(review_texts, truncation=True)


# Apply the tokenizer to every split, one batch of examples at a time
tokenized_ds = ds.map(preprocess_function, batched=True)

In [None]:
# Save the tokenized dataset to Drive so it can be restored with load_from_disk
# (same path as in the loading cells) instead of re-running the tokenization.
tokenized_ds.save_to_disk('/content/drive/My Drive/Colab Notebooks/LLM Project GoogleColab/Models Results/Fine_tune_JulienS_tokenized_dataset')

In [None]:
from google.colab import files
import datasets

# Open the tokenized dataset without having to run the code again
# (reads the directory written by the save_to_disk cell above).
tokenized_ds = datasets.load_from_disk('/content/drive/My Drive/Colab Notebooks/LLM Project GoogleColab/Models Results/Fine_tune_JulienS_tokenized_dataset')

In [None]:
# Display the splits, columns and row counts of the tokenized dataset.
tokenized_ds

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 650000
    })
    test: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 50000
    })
})

## 4. Create Functions for Trainer Arguments

In [None]:
from transformers import AutoModelForSequenceClassification
import torch.nn as nn

# Custom model class to debug the forward pass
# NOTE(review): AutoModelForSequenceClassification is a factory class; its
# from_pretrained presumably returns an instance of the concrete architecture
# class rather than of this subclass, in which case this forward() override and
# its prints never run. Confirm with type(model) after loading.
class CustomModel(AutoModelForSequenceClassification):
    def forward(self, input_ids=None, attention_mask=None, labels=None):
        """Print the input, logit and label shapes around the parent forward pass and return its outputs."""
        print(f"Forward pass: input_ids shape: {input_ids.shape}, attention_mask shape: {attention_mask.shape}")
        outputs = super().forward(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        logits = outputs.logits
        # labels defaults to None, so this print raises AttributeError when no labels are passed
        print(f"Logits shape: {logits.shape}, Labels shape: {labels.shape}")
        return outputs

# Load the pretrained checkpoint through the custom class, configured for 5 labels.
# ignore_mismatched_sizes=True lets loading proceed when a checkpoint tensor
# does not match the shape required by the 5-label configuration.
model = CustomModel.from_pretrained(
    "juliensimon/reviews-sentiment-analysis",
    num_labels=5,
    ignore_mismatched_sizes=True
)

# Update label mappings so the config reports star-rating names instead of generic labels
model.config.id2label = id2label
model.config.label2id = label2id

# Reinitialize Classifier with a fresh 5-output linear layer of the same input width.
# NOTE(review): with ignore_mismatched_sizes=True a mismatched head is presumably
# already re-initialised during loading, so this replacement may be redundant — confirm.
model.classifier = nn.Linear(model.classifier.in_features, 5)  # Reinitialize classifier


### 4.1 Padding with PyTorch

In [None]:
from transformers import DataCollatorWithPadding

# Collator that pads the examples of each batch using the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

### 4.2 Evaluate

In [None]:
import evaluate

# Accuracy metric object; compute_metrics below calls its .compute().
accuracy = evaluate.load("accuracy")

In [None]:
# Creating function to return evaluation metric 'Accuracy'.
import numpy as np

def compute_metrics(eval_pred):
    """Return the accuracy of the arg-max class predictions.

    ``eval_pred`` unpacks into a pair: the model scores and the reference labels.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)


## 5. Train

### 5.1 Train on a subset of the dataset with an increased learning rate (5e-5)

In [None]:
from sklearn.model_selection import train_test_split
from transformers import TrainingArguments, Trainer
import evaluate
from transformers import AutoModelForSequenceClassification
import torch.nn as nn

# Accuracy metric used by compute_metrics
accuracy = evaluate.load("accuracy")

# Draw a reproducible 5% sample of row indices from each split
SUBSET_FRACTION = 0.05
SUBSET_SEED = 42
train_indices, _ = train_test_split(
    range(len(tokenized_ds['train'])),
    train_size=SUBSET_FRACTION,
    random_state=SUBSET_SEED,
)
eval_indices, _ = train_test_split(
    range(len(tokenized_ds['test'])),
    train_size=SUBSET_FRACTION,
    random_state=SUBSET_SEED,
)

# Materialise the sampled rows as smaller datasets
train_subset = tokenized_ds['train'].select(train_indices)
eval_subset = tokenized_ds['test'].select(eval_indices)

# Training arguments for the run on the 5% subsets.
# Evaluation and checkpointing both happen once per epoch, and the best
# checkpoint is reloaded when training finishes (load_best_model_at_end=True).
training_args = TrainingArguments(
    output_dir='/content/drive/My Drive/Colab Notebooks/LLM Project GoogleColab/Models/Fine_Tune_Model',
    learning_rate=5e-5,  # Increased learning rate
    per_device_train_batch_size=8,  # Reduced batch size
    per_device_eval_batch_size=8,   # Reduced batch size
    num_train_epochs=2,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)

# Trainer wired to the subset datasets, the padding collator and the accuracy metric.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_subset,
    eval_dataset=eval_subset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)



In [None]:
# Run the fine-tuning on the 5% subset; the per-epoch metrics are shown in the output below.
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy
1,0.8631,0.843033,0.6216
2,0.6283,0.879343,0.6508


TrainOutput(global_step=8126, training_loss=0.7982068804722832, metrics={'train_runtime': 2583.1212, 'train_samples_per_second': 25.163, 'train_steps_per_second': 3.146, 'total_flos': 6620536620686400.0, 'train_loss': 0.7982068804722832, 'epoch': 2.0})

In [None]:
# Save the trained model (fine-tuned on the 5% subset) to its own directory on Drive
trainer.save_model('/content/drive/My Drive/Colab Notebooks/LLM Project GoogleColab/Models/Fine_Tune_Model_5_percent')

### 5.2 Train on the Full Dataset

#### 5.2.1 Troubleshooting scripts (optional)

In [None]:
# Check batch sizes before training
# NOTE(review): `trainer_full` is not defined in any cell visible in this notebook;
# on a fresh kernel this cell presumably raises NameError — confirm where it was created.
for batch in trainer_full.get_train_dataloader():
    inputs = batch["input_ids"]
    labels = batch["labels"]
    # Dimension 0 is compared between inputs and labels to confirm they line up
    print(f"Input batch size: {inputs.size(0)}, Label batch size: {labels.size(0)}")
    break  # Only print the first batch to check sizes

Input batch size: 8, Label batch size: 8


In [None]:
# Print the full tensor shapes of the first training batch
# NOTE(review): `trainer_full` is not defined in any cell visible in this notebook;
# on a fresh kernel this cell presumably raises NameError — confirm where it was created.
for step, batch in enumerate(trainer_full.get_train_dataloader()):
    inputs = batch["input_ids"]
    labels = batch["labels"]
    print(f"Step {step}: Input shape: {inputs.shape}, Label shape: {labels.shape}")
    break  # Only print the first batch to check sizes

Step 0: Input shape: torch.Size([8, 199]), Label shape: torch.Size([8])


In [None]:
# Report the (rows, columns) shape of each split of the tokenized dataset
train_shape = tokenized_ds['train'].shape
test_shape = tokenized_ds['test'].shape
print(f"Train dataset shape: {train_shape}")
print(f"Test dataset shape: {test_shape}")


Train dataset shape: (650000, 4)
Test dataset shape: (50000, 4)


In [None]:
# Print a sample from the tokenized training dataset
# (the first row, surrounded by blank lines for readability)
print('\n', tokenized_ds["train"][0], '\n')



 {'label': 4, 'text': "dr. goldberg offers everything i look for in a general practitioner.  he's nice and easy to talk to without being patronizing; he's always on time in seeing his patients; he's affiliated with a top-notch hospital (nyu) which my parents have explained to me is very important in case something happens and you need surgery; and you can get referrals to see specialists without having to see him first.  really, what more do you need?  i'm sitting here trying to think of any complaints i have about him, but i'm really drawing a blank.", 'input_ids': [101, 2852, 1012, 18522, 4107, 2673, 1045, 2298, 2005, 1999, 1037, 2236, 18742, 1012, 2002, 1005, 1055, 3835, 1998, 3733, 2000, 2831, 2000, 2302, 2108, 9161, 6026, 1025, 2002, 1005, 1055, 2467, 2006, 2051, 1999, 3773, 2010, 5022, 1025, 2002, 1005, 1055, 6989, 2007, 1037, 2327, 1011, 18624, 2902, 1006, 27935, 1007, 2029, 2026, 3008, 2031, 4541, 2000, 2033, 2003, 2200, 2590, 1999, 2553, 2242, 6433, 1998, 2017, 2342, 5970, 10

In [None]:
# Check a batch of data after collation
# NOTE(review): `trainer_full` is not defined in any cell visible in this notebook;
# on a fresh kernel this cell presumably raises NameError — confirm where it was created.
for batch in trainer_full.get_train_dataloader():
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]
    # All three tensors should agree on dimension 0 (the batch dimension)
    print(f"Input IDs batch size: {input_ids.size(0)}, Attention Mask batch size: {attention_mask.size(0)}, Labels batch size: {labels.size(0)}")
    break  # Only print the first batch to check sizes


Input IDs batch size: 8, Attention Mask batch size: 8, Labels batch size: 8


#### 5.2.2 Training

In [None]:
pip install transformers datasets evaluate accelerate

In [None]:
from google.colab import drive
# Mount Google Drive again; the dataset and model paths below live under /content/drive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from transformers import pipeline, AutoTokenizer, TrainingArguments, Trainer, AutoModelForSequenceClassification, DataCollatorWithPadding
from datasets import load_dataset, DatasetDict, Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import evaluate
import torch.nn as nn
from google.colab import files
import datasets
import time
import torch

In [None]:
# Tokenizer of the base checkpoint; passed to the data collator and the Trainer below.
tokenizer = AutoTokenizer.from_pretrained("juliensimon/reviews-sentiment-analysis")

# Custom model class to debug the forward pass
# NOTE(review): AutoModelForSequenceClassification is a factory class; its
# from_pretrained presumably returns an instance of the concrete architecture
# class rather than of this subclass, in which case this forward() override and
# its prints never run. Confirm with type(model) after loading.
class CustomModel(AutoModelForSequenceClassification):
    def forward(self, input_ids=None, attention_mask=None, labels=None):
        """Print the input, logit and label shapes around the parent forward pass and return its outputs."""
        print(f"Forward pass: input_ids shape: {input_ids.shape}, attention_mask shape: {attention_mask.shape}")
        outputs = super().forward(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        logits = outputs.logits
        # labels defaults to None, so this print raises AttributeError when no labels are passed
        print(f"Logits shape: {logits.shape}, Labels shape: {labels.shape}")
        return outputs

In [None]:
# Open the tokenized dataset without having to run the code again
# (reads the directory written by save_to_disk in section 3).
tokenized_ds = datasets.load_from_disk('/content/drive/My Drive/Colab Notebooks/LLM Project GoogleColab/Models Results/Fine_tune_JulienS_tokenized_dataset')

In [None]:
from transformers import DataCollatorWithPadding

# Collator that pads the examples of each batch using the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Load evaluation metrics (both are used by compute_metrics below)
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

# Creating function to return evaluation metric 'Accuracy'.
import numpy as np

def compute_metrics(eval_pred):
    """Return accuracy and weighted F1 of the arg-max class predictions.

    ``eval_pred`` unpacks into a pair: the model scores and the reference labels.
    The two metric dictionaries are merged into a single result dict.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)

    merged = {}
    merged.update(accuracy_metric.compute(predictions=predicted_ids, references=references))
    merged.update(f1_metric.compute(predictions=predicted_ids, references=references, average="weighted"))
    return merged

In [None]:
# Load the trained model from the checkpoint
# NOTE(review): no cell visible in this notebook writes to 'Fine_Tune_Model_Full'
# (section 5.1 saves to 'Fine_Tune_Model_5_percent'); confirm this directory exists
# before running on a fresh runtime.
model = CustomModel.from_pretrained('/content/drive/My Drive/Colab Notebooks/LLM Project GoogleColab/Models/Fine_Tune_Model_Full')

# # Sample a fraction of the dataset for a quick test
# small_train_dataset, _ = train_test_split(tokenized_ds["train"], train_size=0.01, random_state=42)
# small_eval_dataset, _ = train_test_split(tokenized_ds["test"], train_size=0.01, random_state=42)

# Training arguments for the full-dataset run (the Trainer below receives the
# complete 'train' and 'test' splits, not subsets).
# Evaluation and checkpointing happen once per epoch; training loss is logged every 10 steps.
training_args = TrainingArguments(
    output_dir='/content/drive/My Drive/Colab Notebooks/LLM Project GoogleColab/Models/Model_Full',
    learning_rate=5e-5,  # Same learning rate as the subset run in 5.1
    per_device_train_batch_size=8,  # Reduced batch size
    per_device_eval_batch_size=8,   # Reduced batch size
    num_train_epochs=2,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    logging_dir='/content/drive/My Drive/Colab Notebooks/LLM Project GoogleColab/Logs_Full',
    logging_steps=10
)

# Trainer over the full train/test splits, reporting accuracy and weighted F1.
trainer_full_2 = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Train the model, resuming from the specified checkpoint
# NOTE(review): the checkpoint path is the same directory the model weights were
# loaded from above; resuming presumably also needs the saved trainer/optimizer
# state in that directory — confirm it is a Trainer checkpoint and not only a saved model.
trainer_full_2.train(resume_from_checkpoint='/content/drive/My Drive/Colab Notebooks/LLM Project GoogleColab/Models/Fine_Tune_Model_Full')

# Save model (same directory as output_dir in the training arguments)
trainer_full_2.save_model('/content/drive/My Drive/Colab Notebooks/LLM Project GoogleColab/Models/Model_Full')

In [None]:
# Evaluate the full-dataset model on the trainer's eval_dataset (the 'test' split) and print the metrics.
eval_results = trainer_full_2.evaluate()
print(eval_results)

{'eval_loss': 0.7210174202919006, 'eval_accuracy': 0.69102, 'eval_runtime': 164.3153, 'eval_samples_per_second': 304.293, 'eval_steps_per_second': 38.037, 'epoch': 2.0}


## 6. Optimize and Evaluate

In [None]:
from huggingface_hub import notebook_login
# Log in to the Hugging Face Hub; the training arguments below set push_to_hub=True.
notebook_login()

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import datasets
import evaluate
import numpy as np
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
import logging
from transformers import EarlyStoppingCallback

# Tokenizer of the base checkpoint; passed to the data collator and the Trainer below.
tokenizer = AutoTokenizer.from_pretrained("juliensimon/reviews-sentiment-analysis")

# Custom model class intended to raise the dropout rates to 0.3
# NOTE(review): two things to confirm before relying on this:
#   1. from_pretrained on an Auto factory class presumably returns an instance of the
#      concrete architecture class, so this __init__ may never run;
#   2. the values are written to self.config only after super().__init__ has returned,
#      and whether the loaded architecture reads these two attribute names is not
#      visible here.
class CustomModel(AutoModelForSequenceClassification):
    def __init__(self, *args, **kwargs):
        """Forward all arguments to the parent constructor, then set two dropout fields on the config."""
        super().__init__(*args, **kwargs)
        # Modify the dropout rate if needed (e.g., 0.3)
        self.config.hidden_dropout_prob = 0.3
        self.config.attention_probs_dropout_prob = 0.3

# Open the tokenized dataset without having to run the code again
# (reads the directory written by save_to_disk in section 3).
tokenized_ds = datasets.load_from_disk('/content/drive/My Drive/Colab Notebooks/LLM Project GoogleColab/Models Results/Fine_tune_JulienS_tokenized_dataset')

from transformers import DataCollatorWithPadding
# Collator that pads the examples of each batch using the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Load evaluation metrics (both are used by compute_metrics below)
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Return accuracy and weighted F1 of the arg-max class predictions.

    ``eval_pred`` unpacks into a pair: the model scores and the reference labels.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)

    # Start from the accuracy dict and fold the F1 dict into it
    combined = dict(accuracy_metric.compute(predictions=predicted_ids, references=references))
    combined.update(
        f1_metric.compute(predictions=predicted_ids, references=references, average="weighted")
    )
    return combined

# Load the trained model from the checkpoint
# ('Model_Full' is the directory written by the save_model call in section 5.2.2).
model = CustomModel.from_pretrained('/content/drive/My Drive/Colab Notebooks/LLM Project GoogleColab/Models/Model_Full')

# Sample a fraction of the dataset for a quick test
#small_train_dataset = tokenized_ds["train"].shuffle(seed=42).select([i for i in list(range(0, int(0.01 * len(tokenized_ds["train"]))))])
#small_eval_dataset = tokenized_ds["test"].shuffle(seed=42).select([i for i in list(range(0, int(0.01 * len(tokenized_ds["test"]))))])

# Repo name (used as output_dir in the training arguments below, which also set push_to_hub=True)
repo_name = 'SentimentAnalysis-YelpReviews-OptimizedModel-Tryout'

# Training arguments for the optimization run (the Trainer below receives the
# full 'train' and 'test' splits; the subset lines are commented out).
training_args = TrainingArguments(
    output_dir=repo_name,
    learning_rate=2e-5,  # Lower than the 5e-5 used in section 5
    per_device_train_batch_size=8,  # Reduced batch size
    per_device_eval_batch_size=8,   # Reduced batch size
    num_train_epochs=3,
    weight_decay=0.1,  # Higher than the 0.01 used in section 5
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    logging_dir='/content/drive/My Drive/Colab Notebooks/LLM Project GoogleColab/Logs_Full',
    logging_steps=10,
    push_to_hub=True,
    report_to="none"
)

# Trainer for the optimization run, with early stopping configured
# (patience of 1 evaluation, improvement threshold 0.001).
optimized_trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds['train'],
    eval_dataset=tokenized_ds['test'],
    #train_dataset=small_train_dataset,
    #eval_dataset=small_eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1, early_stopping_threshold=0.001)]
)

# Enable custom logging at INFO level with a module-level logger
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Optimize Model
optimized_trainer.train()

# Save model to Drive; later cells reload it from this 'Optimized Model' directory
optimized_trainer.save_model('/content/drive/My Drive/Colab Notebooks/LLM Project GoogleColab/Models/Optimized Model')

# Log final metrics from a fresh evaluation on the trainer's eval_dataset
logger.info("Training completed")
results = optimized_trainer.evaluate()
logger.info(f"Evaluation results: {results}")

In [None]:
# Display the evaluation metrics of the optimized model.
# NOTE(review): the recorded output below (about 1.8 s at about 273 samples/s) looks
# like it came from a small evaluation subset rather than the 50,000-row test split — confirm.
optimized_trainer.evaluate()

{'eval_loss': 0.754779577255249,
 'eval_accuracy': 0.694,
 'eval_f1': 0.6945064184716137,
 'eval_runtime': 1.8316,
 'eval_samples_per_second': 272.981,
 'eval_steps_per_second': 34.396,
 'epoch': 3.0}

In [None]:
# Load the optimized model and tokenizer:
# the tokenizer comes from the base checkpoint on the Hub, the model weights from
# the 'Optimized Model' directory saved on Drive.
optimized_tokenizer = AutoTokenizer.from_pretrained("juliensimon/reviews-sentiment-analysis")
optimized_model = AutoModelForSequenceClassification.from_pretrained('/content/drive/My Drive/Colab Notebooks/LLM Project GoogleColab/Models/Optimized Model')

In [None]:
# Wrap the optimized model and its tokenizer in a ready-to-call inference pipeline
sentiment_analysis = pipeline(
    'sentiment-analysis',
    model=optimized_model,
    tokenizer=optimized_tokenizer,
)

# Smoke-test the pipeline on one clearly positive and one clearly negative sentence
sample_texts = ["I love this product!", "This is the worst experience ever."]
results = sentiment_analysis(sample_texts)
print(results)

[{'label': '5 stars', 'score': 0.7676988244056702}, {'label': '1 star', 'score': 0.986068844795227}]


In [None]:
import datasets
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, classification_report
from transformers import Trainer, DataCollatorWithPadding
import json
import torch

# Load the tokenized dataset (the directory written by save_to_disk in section 3)
tokenized_ds = datasets.load_from_disk('/content/drive/My Drive/Colab Notebooks/LLM Project GoogleColab/Models Results/Fine_tune_JulienS_tokenized_dataset')

def compute_metrics(pred):
    """Return accuracy and weighted F1 for a prediction object.

    ``pred`` must expose ``label_ids`` (reference labels) and ``predictions``
    (per-class scores); the predicted class is the arg-max over the last axis.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average='weighted'),
    }

# Use DataCollatorWithPadding for proper padding during evaluation
data_collator = DataCollatorWithPadding(tokenizer=optimized_tokenizer)

# Evaluate on a smaller subset for quicker results:
# shuffle reproducibly (seed=42), then keep the first 500 shuffled examples.
# A range is passed directly instead of materialising a list of indices.
subset_test_dataset = tokenized_ds['test'].shuffle(seed=42).select(range(500))

# Initialize Trainer for evaluation only (no training arguments or training data are passed)
evaluation_trainer = Trainer(
    model=optimized_model,
    compute_metrics=compute_metrics,
    data_collator=data_collator
)

# Evaluate the model on the subset
eval_results = evaluation_trainer.evaluate(eval_dataset=subset_test_dataset)
print(eval_results)

# Save evaluation results to a JSON file (read back by a later cell)
with open('/content/drive/My Drive/Colab Notebooks/LLM Project GoogleColab/Models Results/eval_results.json', 'w') as f:
    json.dump(eval_results, f)

# Get predictions and true labels on the subset
# (predict returns a 3-tuple; the third element is not needed here)
predictions, labels, _ = evaluation_trainer.predict(subset_test_dataset)
# Convert per-class scores to predicted class ids
predictions = np.argmax(predictions, axis=1)

# Generate a classification report, labelling rows with the dataset's class names
report = classification_report(labels, predictions, target_names=tokenized_ds['test'].features['label'].names)
print(report)

# Save classification report to a text file (read back by a later cell)
with open('/content/drive/My Drive/Colab Notebooks/LLM Project GoogleColab/Models Results/classification_report.txt', 'w') as f:
    f.write(report)





{'eval_loss': 0.773500382900238, 'eval_accuracy': 0.684, 'eval_f1': 0.6833543859772582, 'eval_runtime': 98.6782, 'eval_samples_per_second': 5.067, 'eval_steps_per_second': 0.638}
              precision    recall  f1-score   support

      1 star       0.79      0.78      0.79       110
      2 star       0.64      0.69      0.66       112
     3 stars       0.70      0.67      0.69        92
     4 stars       0.62      0.56      0.59       100
     5 stars       0.66      0.71      0.68        86

    accuracy                           0.68       500
   macro avg       0.68      0.68      0.68       500
weighted avg       0.68      0.68      0.68       500



In [None]:
# Save classification report to a text file
# NOTE: this repeats the write already done at the end of the previous cell (same path, same `report`).
with open('/content/drive/My Drive/Colab Notebooks/LLM Project GoogleColab/Models Results/classification_report.txt', 'w') as f:
    f.write(report)

In [None]:
import json

# Define the path to the evaluation results and classification report
eval_results_path = '/content/drive/My Drive/Colab Notebooks/LLM Project GoogleColab/Models Results/eval_results.json'
classification_report_path = '/content/drive/My Drive/Colab Notebooks/LLM Project GoogleColab/Models Results/classification_report.txt'

# Load the evaluation results
with open(eval_results_path, 'r') as f:
    eval_results = json.load(f)

# Print the evaluation results
print("Evaluation Results:")
print(eval_results)

# Load and print the classification report.
# The text is kept under its own name: binding it to `classification_report`
# would shadow sklearn.metrics.classification_report and make the evaluation
# cell fail ("'str' object is not callable") if it is re-run afterwards.
with open(classification_report_path, 'r') as f:
    classification_report_text = f.read()

print("Classification Report:")
print(classification_report_text)


Evaluation Results:
{'eval_loss': 0.773500382900238, 'eval_accuracy': 0.684, 'eval_f1': 0.6833543859772582, 'eval_runtime': 98.6782, 'eval_samples_per_second': 5.067, 'eval_steps_per_second': 0.638}
Classification Report:
              precision    recall  f1-score   support

      1 star       0.79      0.78      0.79       110
      2 star       0.64      0.69      0.66       112
     3 stars       0.70      0.67      0.69        92
     4 stars       0.62      0.56      0.59       100
     5 stars       0.66      0.71      0.68        86

    accuracy                           0.68       500
   macro avg       0.68      0.68      0.68       500
weighted avg       0.68      0.68      0.68       500



In [None]:
from huggingface_hub import HfApi, HfFolder

# Specify your Hugging Face username and the repository name
username = "ElizaClaPa"  # Replace with your Hugging Face username
repo_name = "SentimentAnalysis-YelpReviews-OptimizedModel"

# The full repository ID (username/repo_name)
repo_id = f"{username}/{repo_name}"

# Push the model to the Hub
# (only the model is pushed here; the tokenizer is pushed in a later cell)
optimized_model.push_to_hub(repo_id)

In [None]:
!pip install huggingface_hub

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from huggingface_hub import HfApi

# Pushing optimized model and tokenizer to Hugging Face
model_path = '/content/drive/My Drive/Colab Notebooks/LLM Project GoogleColab/Models/Optimized Model'
repo_id = 'ElizaClaPa/SentimentAnalysis-YelpReviews-OptimizedModel'

# Load the model and the tokenizer once each, both from the saved model directory,
# so the pushed tokenizer is the one stored alongside the fine-tuned weights.
optimized_model = AutoModelForSequenceClassification.from_pretrained(model_path)
optimized_tokenizer = AutoTokenizer.from_pretrained(model_path)

# Push to Hugging Face Hub
optimized_model.push_to_hub(repo_id)
optimized_tokenizer.push_to_hub(repo_id)

In [None]:
 from transformers import pipeline


# Hub repository that the optimized model and tokenizer were pushed to
model_name = "ElizaClaPa/SentimentAnalysis-YelpReviews-OptimizedModel"

# Initialize the sentiment analysis pipeline with the correct model and tokenizer
# (both are resolved from the same Hub repository)
sentiment_pipeline = pipeline('sentiment-analysis', model=model_name, tokenizer=model_name)

# Hand-written sample reviews used as a qualitative check
data = [
    "This restaurant was the best ever, I really enjoyed the food there!",
    "I would recommend this to my family and friends!",
    "Not that big of a deal, I don't know what everyone is talking about",
    "It was okay, not that bad, but also not extremely good.",
    "This was the worst meal I've ever had!"
]

# One result dict (label and score) per input sentence
results = sentiment_pipeline(data)
print(results)


[{'label': '5 stars', 'score': 0.9673169255256653}, {'label': '4 stars', 'score': 0.530669629573822}, {'label': '3 stars', 'score': 0.6260088086128235}, {'label': '3 stars', 'score': 0.4920080304145813}, {'label': '1 star', 'score': 0.9903483390808105}]


In [None]:
import pandas as pd
from IPython.display import display
from transformers import pipeline

# Hub repository that the optimized model and tokenizer were pushed to
# (this cell repeats the setup of the previous one before tabulating the results)
model_name = "ElizaClaPa/SentimentAnalysis-YelpReviews-OptimizedModel"

# Initialize the sentiment analysis pipeline with the correct model and tokenizer
sentiment_pipeline = pipeline('sentiment-analysis', model=model_name, tokenizer=model_name)

# Hand-written sample reviews used as a qualitative check
data = [
    "This restaurant was the best ever, I really enjoyed the food there!",
    "I would recommend this to my family and friends!",
    "Not that big of a deal, I don't know what everyone is talking about",
    "It was okay, not that bad, but also not extremely good.",
    "This was the worst meal I've ever had!"
]

# One result dict (label and score) per input sentence
results = sentiment_pipeline(data)

# Tabulate the predictions next to their input sentences, with the text column first
df = pd.DataFrame(results).assign(text=data)[['text', 'label', 'score']]

# Lift pandas' truncation limits so every sentence and every row is shown in full
for option_name in ('display.max_colwidth', 'display.max_rows'):
    pd.set_option(option_name, None)

# Render the table
display(df)


Unnamed: 0,text,label,score
0,"This restaurant was the best ever, I really enjoyed the food there!",5 stars,0.967317
1,I would recommend this to my family and friends!,4 stars,0.53067
2,"Not that big of a deal, I don't know what everyone is talking about",3 stars,0.626009
3,"It was okay, not that bad, but also not extremely good.",3 stars,0.492008
4,This was the worst meal I've ever had!,1 star,0.990348
