In [None]:
!pip install transformers

In [None]:
!pip install datasets

In [None]:
# Shell out to `nvidia-smi` to show which NVIDIA GPU (if any) this runtime has
# and its current memory/utilisation before training starts.
!nvidia-smi

In [None]:
!pip install torch torchvision torchaudio

In [None]:
import csv
import inspect

import huggingface_hub
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# Read the tweet CSV and discard every row that has a missing value
df = pd.read_csv('/content/Twitter_Data.csv').dropna()

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
training_data, validation_data = train_test_split(df, test_size=0.2, random_state=71)

# Rename the columns of both splits (positionally) to the names used below
columns = ["content", "sentiment"]
for split in (training_data, validation_data):
    split.columns = columns

# Encode sentiment labels: fit on the values from both splits so that every
# class is known, then convert each split's sentiments to encoded class ids
le = LabelEncoder()
training_sentiments = training_data['sentiment'].tolist()
validation_sentiments = validation_data['sentiment'].tolist()
all_sentiments = training_sentiments + validation_sentiments
le.fit(all_sentiments)
training_labels = le.transform(training_sentiments)
validation_labels = le.transform(validation_sentiments)

# Load the pre-trained tokenizer and a sequence-classification model whose
# output size matches the number of distinct sentiment classes
# NOTE(review): `timeout` does not appear to be a documented `from_pretrained`
# argument - confirm it actually changes the download timeout.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name, timeout=180)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(le.classes_))

# Tokenize data
def preprocess_function(examples):
    """Tokenize ``examples`` with the notebook-level ``tokenizer``.

    Args:
        examples: The text input handed straight to the tokenizer call.

    Returns:
        Whatever the tokenizer returns when called with truncation and
        padding both switched on.
    """
    tokenizer_options = {"truncation": True, "padding": True}
    return tokenizer(examples, **tokenizer_options)

# Pull the text column out of each split as a plain Python list ...
training_texts, validation_texts = [
    frame['content'].tolist() for frame in (training_data, validation_data)
]
# ... and tokenize each list in a single call (training first, then validation)
tokenized_training_data, tokenized_validation_data = [
    preprocess_function(texts) for texts in (training_texts, validation_texts)
]

# Create PyTorch datasets
class SentimentDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    ``encodings`` is any mapping of field name -> indexable sequence and
    ``labels`` is an indexable sequence; sample ``i`` is built from position
    ``i`` of every field plus ``labels[i]``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The dataset size is defined by the number of labels
        return len(self.labels)

    def __getitem__(self, idx):
        # Convert the idx-th entry of every encoding field into a tensor
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # The label is stored under the 'labels' key alongside the encoding fields
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap the tokenized splits and their encoded labels as PyTorch datasets
train_dataset = SentimentDataset(tokenized_training_data, training_labels)
eval_dataset = SentimentDataset(tokenized_validation_data, validation_labels)

# Define training arguments
# Newer transformers releases name the evaluation-schedule argument
# `eval_strategy`; older ones only know `evaluation_strategy`. Because the
# install above is unpinned, pick whichever name the installed version accepts.
eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    **{eval_strategy_arg: "epoch"},  # evaluate on eval_dataset after every epoch
)

# Create Trainer and train the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

trainer.train()

# Evaluate the model. The metrics are kept and printed explicitly: this is not
# the last expression of the cell, so the notebook would otherwise discard them.
eval_metrics = trainer.evaluate()
print(eval_metrics)

# Save the model and tokenizer side by side so they can be reloaded together
model.save_pretrained("./saved_model_2")
tokenizer.save_pretrained("./saved_model_2")

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Build a dataset over the validation encodings/labels for prediction
predict_dataset = SentimentDataset(tokenized_validation_data, validation_labels)

# Run the trained model over it and take the highest-scoring class per row
predictions = trainer.predict(predict_dataset)
logits = predictions.predictions
predicted_labels = logits.argmax(axis=1)

# Both sides of the comparison are encoded class ids (validation_labels), not
# the raw values of validation_data['sentiment']
accuracy = accuracy_score(validation_labels, predicted_labels)
precision, recall, f1, _ = precision_recall_fscore_support(
    validation_labels,
    predicted_labels,
    average='weighted',
)

# Report each metric on its own line
for metric_name, metric_value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1-score:", f1),
):
    print(metric_name, metric_value)

Reload the saved model and test it on a different dataset

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# Specify the directory where the model was saved
model_directory = "./saved_model_2"

# Load the fine-tuned model and tokenizer
loaded_model = AutoModelForSequenceClassification.from_pretrained(model_directory)
loaded_tokenizer = AutoTokenizer.from_pretrained(model_directory)

# Create a pipeline for sequence classification
classifier = pipeline("text-classification", model=loaded_model, tokenizer=loaded_tokenizer, return_all_scores=False)

# NOTE(review): this is the same CSV path the model was trained from, so the
# accuracy below is not a held-out estimate - point it at a separate file for
# a genuine test.
df = pd.read_csv('/content/Twitter_Data.csv')
df = df.dropna()
# A freshly read frame carries the CSV's own header. Apply the same positional
# rename that was applied to the training/validation splits so the 'content'
# and 'sentiment' columns exist (a no-op if the header already uses these names).
df.columns = ["content", "sentiment"]

test_text = df['content'].tolist()
test_labels = df['sentiment'].tolist()

# Truncate over-long inputs, as was done when tokenizing the training data
predictions = classifier(test_text, truncation=True)

# Show a small sample instead of one dict per input row
print(predictions[:5])

# Each prediction is a dict such as {'label': 'LABEL_1', 'score': ...}. Map the
# label string to its class id through the model config, then back to the
# original sentiment value with the LabelEncoder `le` fitted before training,
# so predictions and ground truth are compared in the same label space.
predicted_ids = [loaded_model.config.label2id[pred["label"]] for pred in predictions]
predicted_sentiments = le.inverse_transform(predicted_ids)

accuracy = sum(1 for pred, label in zip(predicted_sentiments, test_labels) if pred == label) / len(test_labels)
print("Accuracy:", accuracy)
