https://huggingface.co/siebert/sentiment-roberta-large-english

# IMPORT LIBRARIES

In [1]:
# Data processing
import pandas as pd
import numpy as np
from datasets import load_dataset, Dataset
import json

# Model
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback, TextClassificationPipeline
import torch
from torch import softmax

# Evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, auc, roc_curve
import evaluate

# Visualization
from seaborn import heatmap
from matplotlib import pyplot as plt

# LOAD DATA

In [2]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [3]:
train.head()

Unnamed: 0,train_idx,text,label,label_text
0,0,i really do recommend this to anyone in need o...,1,positive
1,1,very good every day camera fits nicely in the ...,1,positive
2,2,"but , dollar for dollar , this dvd player is p...",1,positive
3,3,i got this phone yesterday and didn ' t find a...,1,positive
4,4,1 ) price gb of storage,1,positive


In [4]:
test.head()

Unnamed: 0,test_idx,text
0,0,fm receiver it has none
1,1,"the picture quality surprised me , when i firs..."
2,2,great video clip quality for a digital camera ...
3,3,creative did well on its rechargeable battery ...
4,4,i highly recommend this camera to anyone looki...


# TESTING MODEL

In [5]:
from transformers import pipeline
model = "siebert/sentiment-roberta-large-english"
sentiment_analysis = pipeline("sentiment-analysis", model = model)
print(test['text'][np.random.randint(0, len(test))], "\n", sentiment_analysis(test['text'][np.random.randint(0, len(test))]))

i tried to call norton , and the only numbers they list on their sites are either for non tech customer service ( tried this one and hung up after 75 minute of muzak ) or they have a 30 tech support number 
 [{'label': 'POSITIVE', 'score': 0.9988652467727661}]


# MODEL

In [6]:
# Read the data
train = pd.read_csv('data/train.csv')
train_standalone = train[['text', 'label']]

# Split the data into train and validation
train_split, val_split = np.split(train_standalone.sample(frac=1, random_state=42), [int(.8*len(train))])

# Convert pyhton dataframe to Hugging Face arrow dataset
train_split_dt = Dataset.from_pandas(train_split)
val_split_dt = Dataset.from_pandas(val_split)

#define the model
model_name = "siebert/sentiment-roberta-large-english"

# Tokenizer from a pretrained model
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Funtion to tokenize data
def tokenize_dataset(data):
    return tokenizer(data["text"], 
                     max_length=32, 
                     truncation=True, 
                     padding="max_length")

# Tokenize the dataset
train_encoding = train_split_dt.map(tokenize_dataset)
val_encoding = val_split_dt.map(tokenize_dataset)

# Load model
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./models/content/models/sentiment-roberta-large-english",          
    logging_dir='./models/content/logs/sentiment-roberta-large-english',            
    logging_strategy='epoch',
    logging_steps=100,    
    num_train_epochs=6,              
    per_device_train_batch_size=4,  
    per_device_eval_batch_size=4,  
    learning_rate=5e-6,
    seed=42,
    save_strategy='epoch',
    save_steps=100,
    evaluation_strategy='epoch',
    eval_steps=100,
    load_best_model_at_end=True,
)

# Function to compute the metric
def compute_metrics(eval_pred):
    metric = evaluate.load("accuracy")
    logits, labels = eval_pred
    # probabilities = tf.nn.softmax(logits)
    predictions = np.argmax(logits, axis=1)
    return metric.compute(predictions=predictions, references=labels)

# Train the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_encoding,
    eval_dataset=val_encoding,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)]
)

trainer.train()

Map:   0%|          | 0/2412 [00:00<?, ? examples/s]

Map:   0%|          | 0/604 [00:00<?, ? examples/s]



  0%|          | 0/3618 [00:00<?, ?it/s]

{'loss': 0.4549, 'learning_rate': 4.166666666666667e-06, 'epoch': 1.0}


  0%|          | 0/151 [00:00<?, ?it/s]

{'eval_loss': 0.25734788179397583, 'eval_accuracy': 0.9437086092715232, 'eval_runtime': 115.5496, 'eval_samples_per_second': 5.227, 'eval_steps_per_second': 1.307, 'epoch': 1.0}
{'loss': 0.2173, 'learning_rate': 3.3333333333333333e-06, 'epoch': 2.0}


  0%|          | 0/151 [00:00<?, ?it/s]

{'eval_loss': 0.30298638343811035, 'eval_accuracy': 0.9486754966887417, 'eval_runtime': 129.7049, 'eval_samples_per_second': 4.657, 'eval_steps_per_second': 1.164, 'epoch': 2.0}
{'train_runtime': 5980.3263, 'train_samples_per_second': 2.42, 'train_steps_per_second': 0.605, 'train_loss': 0.33607081829216545, 'epoch': 2.0}


TrainOutput(global_step=1206, training_loss=0.33607081829216545, metrics={'train_runtime': 5980.3263, 'train_samples_per_second': 2.42, 'train_steps_per_second': 0.605, 'train_loss': 0.33607081829216545, 'epoch': 2.0})

# TEST

In [7]:
# Read test data
test = pd.read_csv('data/test.csv')

# Convert pyhton dataframe to Hugging Face arrow dataset
test_dt = Dataset.from_pandas(test)

# Tokenize the test dataset
test_encoding = test_dt.map(tokenize_dataset)

# Use the trained model to make predictions on the test dataset
raw_predictions = trainer.predict(test_encoding)

# Convert raw predictions to probabilities and predicted labels
probs = softmax(torch.tensor(raw_predictions.predictions), dim=1).numpy()
pred_labels = np.argmax(probs, axis=1)

# Create a dictionary that maps test_idx to predicted labels
predictions = {"target": {str(test['test_idx'][i]): int(pred_labels[i]) for i in range(len(test))}}

    
# Save the predictions to a JSON file
with open('predictions2.json', 'w') as f:
    json.dump(predictions, f)

Map:   0%|          | 0/754 [00:00<?, ? examples/s]

  0%|          | 0/189 [00:00<?, ?it/s]