In [None]:
import re
from sklearn.model_selection import train_test_split
from datasets import load_dataset, Dataset
from sklearn.preprocessing import LabelEncoder
import cleantext as c
import nltk
import numpy as np
from nltk.corpus import stopwords
from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Download the stopwords once if not already downloaded
# nltk.download('stopwords')
# stop_words = set(stopwords.words('english'))

In [None]:
# Read data.csv (path is relative to the working directory) through the `datasets` CSV loader.
# The returned object is indexed by split name below; only 'train' is used.
dataset = load_dataset("csv", data_files="data.csv")

In [None]:
# Inspect what the loader returned.
dataset

In [None]:
# Peek at the first record of the 'train' split.
dataset['train'][0]

In [None]:
# Take the 'train' split. Despite the name, `df` is still a `datasets` object at this point;
# it only becomes a pandas DataFrame in the following cell.
df = dataset['train']

In [None]:
# Convert to a pandas DataFrame for cleaning and label encoding.
# Not re-runnable on its own: once `df` is a DataFrame it no longer has to_pandas(),
# so re-execute the previous cell before running this one again.
df = df.to_pandas()

In [None]:
# First rows of the raw frame.
df.head()

In [None]:
# Column dtypes and non-null counts, to spot missing values before cleaning.
df.info()

In [None]:
def label_sentiment(row):
    """Map a sentiment string to its integer class id.

    Args:
        row: A single sentiment value; expected to be "positive",
            "negative" or "neutral".

    Returns:
        int: 2 for "positive", 0 for "negative", 1 for "neutral".

    Raises:
        ValueError: If ``row`` is none of the three expected strings.
    """
    # Compared with == in this order, so matching is exact
    # (case- and whitespace-sensitive).
    codes = (("positive", 2), ("negative", 0), ("neutral", 1))
    for name, code in codes:
        if row == name:
            return code
    raise ValueError("Unexpected sentiment value: {}".format(row))

# Replace the string labels in the 'Sentiment' column with their integer class ids, in place.
# Not idempotent: re-running this cell on the already-encoded column makes
# label_sentiment raise ValueError, because the values are no longer strings.
df['Sentiment'] = df['Sentiment'].apply(label_sentiment)


In [None]:
# Confirm the 'Sentiment' column now holds the integer class ids.
df.head()

In [None]:
# Class balance in percent: normalize=True yields fractions, the *100 scales them.
df['Sentiment'].value_counts(normalize=True)*100

In [None]:
def _clean_one(sentence):
    """Run cleantext on one sentence with the stopwords and stemming options on, then lowercase it."""
    return c.clean(sentence, stopwords=True, stemming=True).lower()


# NOTE(review): the cleaned text is later fed to a pretrained transformer tokenizer.
# Stop-word removal and stemming presumably drop or alter words such as negations
# that carry sentiment -- confirm this preprocessing actually helps the model.
cleaned_data = list(map(_clean_one, df['Sentence']))

In [None]:
# Keep only ASCII letters and whitespace; digits, punctuation and every other
# character are deleted (not replaced by a space).
non_letter_pattern = re.compile(r'[^a-zA-Z\s]')
clean_sentence = [non_letter_pattern.sub('', text) for text in cleaned_data]

In [None]:
# Preview a few cleaned sentences instead of dumping the whole list into the
# notebook output (one entry per row of the data set).
clean_sentence[:10]

In [None]:
# Overwrite the raw text with its cleaned version. The original sentences are no
# longer available in `df` after this; reload the data to get them back.
df['Sentence'] = clean_sentence

In [None]:
# Inspect the frame after cleaning and label encoding.
df

In [None]:
# Convert the cleaned frame back into a `datasets.Dataset` for splitting and tokenization.
# NOTE: this rebinds `dataset`, which until now held the object returned by load_dataset;
# earlier cells that index dataset['train'] presumably stop working if re-run after this one.
dataset = Dataset.from_pandas(df)

In [None]:
# Inspect the rebuilt dataset.
dataset

In [None]:
# Split into 80% train, 20% test.
# The fixed seed makes the shuffle reproducible: a fresh kernel run puts the same
# rows in each split, so reported metrics are comparable between runs.
split_dataset = dataset.train_test_split(test_size=0.2, seed=42)

In [None]:
# Inspect the train/test split.
split_dataset

In [None]:
# Carve a validation set out of the training portion: 10% of the 80% train split,
# giving roughly 72% train / 8% validation / 20% test overall.
# The fixed seed keeps the validation rows stable across runs.
train_valid_split = split_dataset['train'].train_test_split(test_size=0.1, seed=42)
train_valid_split.set_format('torch')
final_dataset = {
    'train': train_valid_split['train'],
    'validation': train_valid_split['test'],
    # set_format above only reaches train/validation; format the test split the
    # same way so all three splits are consistent. with_format returns a
    # formatted copy, leaving split_dataset itself untouched.
    'test': split_dataset['test'].with_format('torch')
}

# Check the final splits
print(final_dataset)

In [None]:
# Inspect the held-out test split.
final_dataset['test']

In [None]:
# Load a tokenizer and model
# The tokenizer and the classification model come from the same checkpoint so their vocabularies match.
# num_labels=3 corresponds to the three class ids produced by label_sentiment (0, 1, 2).
# NOTE(review): the labels here are 0=negative, 1=neutral, 2=positive -- confirm this
# matches the checkpoint's own label order so the pretrained head is reused sensibly.
model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

In [None]:
def tokenize(batch):
    """Tokenize the 'Sentence' column of a batch.

    Every sequence is padded to, and truncated at, 128 tokens so all examples
    have the same length.
    """
    sentences = batch['Sentence']
    return tokenizer(sentences, padding="max_length", truncation=True, max_length=128)


def _prepare_split(data):
    """Tokenize one split, expose the target column as "labels" and drop the raw text."""
    encoded = data.map(tokenize, batched=True)
    encoded = encoded.rename_column("Sentiment", "labels")
    return encoded.remove_columns(["Sentence"])


# Same three steps for every split, done in a single pass per split
tokenized_dataset = {
    split: _prepare_split(data)
    for split, data in final_dataset.items()
}


In [None]:
# Inspect the tokenized splits (the target column is now called "labels", the raw text is gone).
tokenized_dataset

In [None]:
# Define training arguments
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version.
# NOTE(review): 20 epochs is a lot for fine-tuning an already fine-tuned checkpoint;
# watch the per-epoch validation loss for overfitting.
training_args = TrainingArguments(
    output_dir="output",  # output directory, relative to the working directory
    evaluation_strategy="epoch",  # evaluate on the eval dataset after every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=20,
    weight_decay=0.01
)

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: The prediction object the Trainer passes in; its
            ``predictions`` hold the per-class logits and ``label_ids``
            the true class ids.

    Returns:
        dict: ``{"accuracy": <fraction of correctly classified examples>}``.
    """
    predicted = np.argmax(eval_pred.predictions, axis=-1)
    return {"accuracy": float((predicted == eval_pred.label_ids).mean())}


# Initialize the Trainer
# compute_metrics makes every evaluation report accuracy in addition to the loss,
# so the per-epoch validation output is directly interpretable.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    compute_metrics=compute_metrics
)

In [None]:
# Fine-tune the model; per training_args, the validation split is evaluated at the end of every epoch.
trainer.train()

In [None]:
# Evaluate on the validation set
# evaluate() returns the metrics the Trainer computed for the given split.
validation_results = trainer.evaluate(tokenized_dataset['validation'])
print("Validation Results:", validation_results)

# Evaluate on the test set
# The test split was never seen during training or per-epoch evaluation.
test_results = trainer.evaluate(tokenized_dataset['test'])
print("Test Results:", test_results)

In [None]:
# Run inference on the test split; `results.predictions` (logits) and
# `results.label_ids` (true labels) are used in the cells below.
results = trainer.predict(tokenized_dataset['test'])

In [None]:
# Inspect the raw prediction output.
results

In [None]:
# Raw per-class scores for every test example
logits = results.predictions

# The column with the highest score is the predicted class id
predicted_classes = logits.argmax(axis=1)

# Show the predicted class ids
print(predicted_classes)

In [None]:
# Same array as printed in the previous cell, shown as cell output.
predicted_classes

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Class ids as assigned by label_sentiment: 0 = negative, 1 = neutral, 2 = positive.
# Passing them explicitly pins the order and length of the per-class arrays, so each
# printed row belongs to the stated class id even if a class is missing from the test split.
class_ids = [0, 1, 2]

# True labels (from the test set)
true_labels = results.label_ids

# Predicted class labels (recomputed here so this cell only depends on `results`)
predicted_classes = np.argmax(results.predictions, axis=1)

# Compute accuracy
accuracy = accuracy_score(true_labels, predicted_classes)

# Compute precision, recall, and F1-score for each class (average=None -> one value per class)
precision, recall, f1, support = precision_recall_fscore_support(
    true_labels, predicted_classes, labels=class_ids, average=None
)

# Display results for each class
print(f"Overall Accuracy: {accuracy:.4f}")
for i, p, r, f, s in zip(class_ids, precision, recall, f1, support):
    print(f"Class {i} - Precision: {p:.4f}, Recall: {r:.4f}, F1 Score: {f:.4f}, Support: {s}")
