### Fine-tuning a LLM to text classification ###

In [1]:
import random

# 1. Load Positive Tweets
with open("twitter-datasets/train_pos_full.txt", "r", encoding="utf-8") as file:
    positive_tweets = file.readlines()
    positive_tweets = positive_tweets[:len(positive_tweets)//10]  # Select only half of the positive tweets

# 2. Load Negative Tweets
with open("twitter-datasets/train_neg_full.txt", "r", encoding="utf-8") as file:
    negative_tweets = file.readlines()
    negative_tweets = negative_tweets[:len(negative_tweets)//10]  # Select only half of the negative tweets

# 3. Create the texts and labels lists
texts = positive_tweets + negative_tweets
labels = [1] * len(positive_tweets) + [0] * len(negative_tweets)

# 4. Shuffle the dataset
data = list(zip(texts, labels))  # Combine texts and labels into tuples
random.shuffle(data)  # Shuffle the dataset

# 5. Unzip the shuffled data back into texts and labels
texts, labels = zip(*data)

# Convert back to list if you need to work with them as lists
texts = list(texts)
labels = list(labels)

# Check the first few entries to ensure everything looks correct

In [None]:
from transformers import AutoTokenizer

# Load a pretrained tokenizer
model_name = "distilbert-base-uncased"  # You can replace this with another model
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize the data
tokenized_data = tokenizer(
    texts,
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors="pt"
)

# Inspect tokenized output
#print(tokenized_data)


In [3]:
import torch

class TweetDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        # Tokenize
        tokens = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        return {
            "input_ids": tokens["input_ids"].squeeze(0),
            "attention_mask": tokens["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long)
        }

# Create dataset and dataloaders
dataset = TweetDataset(texts, labels, tokenizer)
train_loader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True)


In [4]:
from transformers import AutoModelForSequenceClassification

# Load model with classification head
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
from transformers import TrainingArguments, Trainer
from datasets import Dataset

# Prepare the dataset for Trainer
hf_dataset = Dataset.from_dict({"text": texts, "label": labels})
tokenized_dataset = hf_dataset.map(
    lambda x: tokenizer(x["text"], padding="max_length", truncation=True, max_length=128),
    batched=True
)
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Map:   0%|          | 0/250000 [00:00<?, ? examples/s]

In [6]:
import accelerate

# Train the model
# Initialize the Trainer
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model = model.to(device)  # Move model to the MPS device

print("Using device:", device)

Using device: mps


In [8]:

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    evaluation_strategy="no",
    save_strategy="no",
    #no_cuda=True,  # Disable CUDA to avoid conflicts with MPS
    use_mps_device=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,  # Use a separate dataset for evaluation in real use
)

trainer.train()


  0%|          | 0/11721 [00:00<?, ?it/s]

{'loss': 0.6916, 'grad_norm': 0.7164437174797058, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.0}
{'loss': 0.689, 'grad_norm': 0.5984341502189636, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.01}
{'loss': 0.6838, 'grad_norm': 0.8345850706100464, 'learning_rate': 3e-06, 'epoch': 0.01}
{'loss': 0.6822, 'grad_norm': 1.093787670135498, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.01}
{'loss': 0.6679, 'grad_norm': 0.7253750562667847, 'learning_rate': 5e-06, 'epoch': 0.01}
{'loss': 0.657, 'grad_norm': 0.8520417213439941, 'learning_rate': 6e-06, 'epoch': 0.02}
{'loss': 0.629, 'grad_norm': 1.2635774612426758, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.02}
{'loss': 0.5905, 'grad_norm': 1.2798315286636353, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.02}
{'loss': 0.5695, 'grad_norm': 1.3972744941711426, 'learning_rate': 9e-06, 'epoch': 0.02}
{'loss': 0.5325, 'grad_norm': 1.7494885921478271, 'learning_rate': 1e-05, 'epoch': 0.03}
{'loss': 0.4819, 'grad_norm': 3.3

KeyboardInterrupt: 

In [9]:
trainer.save_model("./fine_tuned_twitter_roberta")
tokenizer.save_pretrained("./fine_tuned_twitter_roberta")

('./fine_tuned_twitter_roberta/tokenizer_config.json',
 './fine_tuned_twitter_roberta/special_tokens_map.json',
 './fine_tuned_twitter_roberta/vocab.txt',
 './fine_tuned_twitter_roberta/added_tokens.json',
 './fine_tuned_twitter_roberta/tokenizer.json')

In [10]:
eval_results = trainer.evaluate()
print(eval_results)

  0%|          | 0/3907 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [11]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np

model = AutoModelForSequenceClassification.from_pretrained("./fine_tuned_twitter_roberta")
tokenizer = AutoTokenizer.from_pretrained("./fine_tuned_twitter_roberta")

data_path = "data/twitter-datasets/"
test_path = f"{data_path}test_data.txt"


with open(test_path, "r") as f:
    test_tweets = [line.strip() for line in f]


In [13]:
from tqdm import tqdm
labels = []

# Analyze sentiment
for tweet in tqdm(test_tweets):
    inputs = tokenizer(tweet, return_tensors="pt", truncation=True, max_length=512)
    outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    negative_prob, positive_prob = probs[0].tolist()
    if positive_prob > negative_prob:
        label = 1  # Positive
    else:
        label = -1  # Negative
    labels.append(label)

ids = np.arange(1, len(labels) + 1)



[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

In [14]:
from helpers import create_csv_submission

create_csv_submission(ids, labels, "data/submission_fine_tuned_twitter_roberta.csv")


### Before fine-tuning ###

In [None]:
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("./fine_tuned_twitter_roberta")

data_path = "data/twitter-datasets/"
test_path = f"{data_path}test_data.txt"


with open(test_path, "r") as f:
    test_tweets = [line.strip() for line in f]

In [None]:
from tqdm import tqdm
labels = []

# Analyze sentiment
for tweet in tqdm(test_tweets):
    inputs = tokenizer(tweet, return_tensors="pt", truncation=True, max_length=512)
    outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    negative_prob, positive_prob = probs[0].tolist()
    if positive_prob > negative_prob:
        label = 1  # Positive
    else:
        label = -1  # Negative
    labels.append(label)

ids = np.arange(1, len(labels) + 1)

In [None]:
from helpers import create_csv_submission

create_csv_submission(ids, labels, "data/submission_fine_tuned_twitter_roberta.csv")
