# This cell handles the loading and preprocessing of the data.


In [None]:
!pip install transformers datasets accelerate torch pandas tqdm scikit-learn
!pip install optuna

from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
from datasets import Dataset, DatasetDict

import optuna
import torch
import numpy as np
import os
import optuna

# Set TOKENIZERS_PARALLELISM explicitly before the tokenizer is created.
# NOTE(review): presumably this is meant to silence the tokenizers
# parallelism warning -- confirm that "true" (rather than "false") is intended.
os.environ["TOKENIZERS_PARALLELISM"] = "true"
# Identifier of the pretrained checkpoint passed to every from_pretrained call below.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
# Tokenizer loaded for MODEL; used by preprocess_function and passed to inference.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

def load_text_file(filename):
    """Read a UTF-8 text file and return its lines.

    Args:
        filename: Path of the file to read.

    Returns:
        A list with one string per line of the file, in file order, each with
        leading and trailing whitespace removed. Blank lines are kept as
        empty strings.
    """
    with open(filename, 'r', encoding='utf-8') as handle:
        # Iterating the handle yields one line at a time; str.strip drops the
        # newline together with any surrounding whitespace.
        return list(map(str.strip, handle))

# Toggle between the full training files and the smaller ones.
use_full_dataset = True

# Pick the (positive, negative) file pair that matches the toggle.
pos_file, neg_file = (
    ("/data/train_pos_full.txt", "/data/train_neg_full.txt")
    if use_full_dataset
    else ("/data/train_pos.txt", "/data/train_neg.txt")
)

# One stripped string per line of each file.
positive_texts = load_text_file(pos_file)
negative_texts = load_text_file(neg_file)

# Label 1 = positive, label 0 = negative.
train_texts = positive_texts + negative_texts
train_labels = [1] * len(positive_texts) + [0] * len(negative_texts)

# The lists above are ordered "all positives, then all negatives". Slicing the
# first 10% off as validation data without shuffling would therefore yield a
# validation set that contains only positive examples (and a training set that
# is skewed towards negatives). Shuffle texts and labels together first; the
# fixed seed keeps the split reproducible between runs.
shuffled_order = np.random.default_rng(42).permutation(len(train_texts)).tolist()
train_texts = [train_texts[i] for i in shuffled_order]
train_labels = [train_labels[i] for i in shuffled_order]

# validation split: the first 10% of the shuffled data is held out
val_size = int(0.1 * len(train_texts))
dataset = DatasetDict({
    'train': Dataset.from_dict({
        'text': train_texts[val_size:],
        'label': train_labels[val_size:]
    }),
    'validation': Dataset.from_dict({
        'text': train_texts[:val_size],
        'label': train_labels[:val_size]
    })
})

# Number of tokens every example is truncated / padded to.
MAX_LENGTH = 40


def preprocess_function(examples):
    """Rewrite placeholders in a batch of texts and tokenize them.

    Args:
        examples: Batch mapping whose 'text' entry is a list of strings.

    Returns:
        The output of the module-level tokenizer for the rewritten texts,
        truncated and padded to MAX_LENGTH tokens.
    """
    # Basic preprocessing (not needed but is from the hugging face example):
    # first '@user' -> '@USERNAME', then 'http' -> 'URL'.
    rewritten = [
        raw.replace('@user', '@USERNAME').replace('http', 'URL')
        for raw in examples['text']
    ]
    tokenizer_options = dict(
        truncation=True,
        padding='max_length',
        max_length=MAX_LENGTH,
        return_tensors=None,
    )
    return tokenizer(rewritten, **tokenizer_options)

# Run preprocess_function over the dataset; batched=True hands it lists of
# examples, which is what preprocess_function iterates over.
encoded_dataset = dataset.map(preprocess_function, batched=True)

# Load model
# NOTE: commented out -- the Trainers in this file receive model_init instead,
# which makes the same from_pretrained call.
#model = AutoModelForSequenceClassification.from_pretrained(
 #   MODEL,
 #   num_labels=2, # pos and negative
#    ignore_mismatched_sizes=True
#)

# Evaluation metric handed to the Trainer
def compute_metrics(eval_pred):
    """Compute classification accuracy.

    Args:
        eval_pred: Pair (predictions, labels); predictions holds one row of
            class scores per example, labels the expected class indices.

    Returns:
        A dict with the single key 'accuracy': the fraction of examples whose
        highest-scoring class equals the label.
    """
    scores, labels = eval_pred

    # Index of the largest score in each row is the predicted class.
    predicted_classes = np.argmax(scores, axis=1)
    is_correct = predicted_classes == labels

    return {'accuracy': is_correct.mean()}

def model_init():
    """Build a fresh sequence-classification model from MODEL.

    Returns:
        The model returned by AutoModelForSequenceClassification.from_pretrained
        for MODEL, requested with two labels; ignore_mismatched_sizes=True is
        passed to the loader.
    """
    classifier = AutoModelForSequenceClassification.from_pretrained(
        MODEL, num_labels=2, ignore_mismatched_sizes=True
    )
    return classifier


# This cell handles the hyperparameter search and creates a model with the best hyperparameters.

In [None]:

# Training arguments used while searching.
initial_training_args = TrainingArguments(
    output_dir="./binary-sentiment-finetuned-search",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=False,
    fp16=True,
)

# Initialize first trainer for hyperparameter search; model_init is passed
# instead of a model instance.
search_trainer = Trainer(
    model_init=model_init,
    args=initial_training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    compute_metrics=compute_metrics,
)


def _search_space(trial):
    """Sample the hyperparameters for one Optuna trial."""
    return {
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True),
        "num_train_epochs": trial.suggest_int("num_train_epochs", 1, 5),
        # Single-choice categorical: the batch size is not tuned, but it is
        # still part of the sampled hyperparameters.
        "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [1028]),
        "weight_decay": trial.suggest_float("weight_decay", 0.01, 0.1, log=True),
        "warmup_ratio": trial.suggest_float("warmup_ratio", 0.0, 0.2),
    }


# run hyperparameter search (10 Optuna trials, maximizing the objective)
best_run = search_trainer.hyperparameter_search(
    backend="optuna",
    n_trials=10,
    direction="maximize",
    hp_space=_search_space,
)

print("Best hyperparameters:", best_run.hyperparameters)

# choose new training arguments with best hyperparameters
final_training_args = TrainingArguments(
    output_dir="./binary-sentiment-finetuned-final",
    learning_rate=best_run.hyperparameters["learning_rate"],
    per_device_train_batch_size=best_run.hyperparameters["per_device_train_batch_size"],
    num_train_epochs=best_run.hyperparameters["num_train_epochs"],
    weight_decay=best_run.hyperparameters["weight_decay"],
    warmup_ratio=best_run.hyperparameters["warmup_ratio"],
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=False,
    fp16=True
)

# Initialize final trainer with best hyperparameters
final_trainer = Trainer(
    model_init=model_init,
    args=final_training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    compute_metrics=compute_metrics
)

# Train final model
final_trainer.train()

# `device` is not assigned anywhere earlier in the file (the inference cell
# defines it later), so using it here raised a NameError when the cells are
# run top to bottom. Define it before moving the model.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Keep a reference to the final model for inference
model = final_trainer.model.to(device)

# Contains an example of fine-tuning the model with specified training arguments.


In [None]:
# Hyperparameter values fixed by hand for this run.
fixed_hyperparameters = {
    "learning_rate": 4.030628115465328e-05,
    "num_train_epochs": 3,
    "per_device_train_batch_size": 1028,
    "weight_decay": 0.016583152835062398,
    "warmup_ratio": 0.08091678678536013,
}

training_args = TrainingArguments(
    output_dir="./binary-sentiment-finetuned-full",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=False,
    fp16=True,
    **fixed_hyperparameters,
)

# Initialize trainer; model_init is passed instead of a model instance.
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()


# Does inference on the trained model and outputs the CSV file

In [None]:
import pandas as pd
import torch
from tqdm import tqdm

def predict_sentiment_batch(texts, model, tokenizer, device, max_length=40, batch_size=64):
    """Predict a -1/+1 sentiment label for every text.

    The texts receive the same placeholder rewriting that preprocess_function
    applies to the training data ('@user' -> '@USERNAME', 'http' -> 'URL'), so
    that the model sees inference inputs in the form it was fine-tuned on.

    Args:
        texts: List of input strings.
        model: Classification model; called with the tokenizer output as
            keyword arguments, its result must expose `.logits`.
        tokenizer: Tokenizer callable returning a mapping of tensors.
        device: Device the input tensors are moved to before the forward pass.
        max_length: Token length every input is truncated / padded to.
        batch_size: Number of texts processed per forward pass.

    Returns:
        A list with one int per input text: 1 where the highest-scoring class
        index is 1, otherwise -1. Empty input yields an empty list.
    """
    model.eval()
    predicted_classes = []

    # process in batches
    for start in tqdm(range(0, len(texts), batch_size)):
        # Mirror the training-time text rewriting (previously skipped here,
        # which made inference inputs differ from the training inputs).
        batch = [
            text.replace('@user', '@USERNAME').replace('http', 'URL')
            for text in texts[start:start + batch_size]
        ]
        inputs = tokenizer(
            batch,
            padding='max_length',
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        )
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # No gradients are needed for inference.
        with torch.no_grad():
            logits = model(**inputs).logits
        predicted_classes.extend(logits.argmax(dim=-1).tolist())

    # convert 0/1 to -1/1 (could change in the Trainer)
    return [1 if pred == 1 else -1 for pred in predicted_classes]

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Take the model held by the trainer and move it to the chosen device.
model = trainer.model
model = model.to(device)

# One stripped line per test example.
test_texts = load_text_file("/data/test_data.txt")

# Predict with the training-time token length; the per-device training batch
# size is reused as the inference batch size.
predictions = predict_sentiment_batch(
    texts=test_texts,
    model=model,
    tokenizer=tokenizer,
    device=device,
    max_length=MAX_LENGTH,
    batch_size=training_args.per_device_train_batch_size,
)

# Two columns: 1-based Id and the -1/1 prediction.
submission_columns = {
    'Id': range(1, len(predictions) + 1),
    'Prediction': predictions,
}
df = pd.DataFrame(submission_columns)

# Write the CSV without the DataFrame index column.
df.to_csv('predictions.csv', index=False)
print("Predictions saved to predictions.csv")

# Show the first ten rows as a quick check.
print("\nFirst few predictions:")
print(df.head(10))

# This is a util function that finds good max_length for tokenizer


In [None]:
from transformers import AutoTokenizer
from collections import defaultdict
import numpy as np
import matplotlib.pyplot as plt

# Checkpoint whose tokenizer is used for measuring lengths.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)


def analyze_text_lengths(file_path):
    """Measure the tokenized length of every line in a text file.

    Args:
        file_path: Path of a UTF-8 text file with one text per line.

    Returns:
        A pair (lengths, stats): lengths is a list with the number of ids
        tokenizer.encode returns for each stripped line; stats is a dict with
        the keys 'mean', 'median', 'max', 'min', '95th_percentile' and
        'total_samples' computed from those lengths.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        stripped_lines = list(map(str.strip, handle))

    # Encode each line separately and keep only the length of the result.
    lengths = []
    for line in stripped_lines:
        lengths.append(len(tokenizer.encode(line)))

    stats = {
        'mean': np.mean(lengths),
        'median': np.median(lengths),
        'max': np.max(lengths),
        'min': np.min(lengths),
        '95th_percentile': np.percentile(lengths, 95),
        'total_samples': len(lengths),
    }
    return lengths, stats

# Files whose token-length distribution is reported.
files = [
    '/data/train_pos.txt',
    '/data/train_neg.txt',
    '/data/test_data.txt',
]

# Both dicts are keyed by the full file path.
all_stats, all_lengths = {}, {}

for file_path in files:
    # Only the last path component is shown in the heading.
    print(f"\nAnalyzing {file_path.split('/')[-1]}:")

    lengths, stats = analyze_text_lengths(file_path)
    all_lengths[file_path] = lengths
    all_stats[file_path] = stats

    print(f"Number of samples: {stats['total_samples']}")
    print(f"Mean length: {stats['mean']:.2f}")
    print(f"Median length: {stats['median']:.2f}")
    print(f"Max length: {stats['max']}")
    print(f"95th percentile: {stats['95th_percentile']:.2f}")



# Largest value of each statistic over all analyzed files.
all_95th = max(file_stats['95th_percentile'] for file_stats in all_stats.values())
all_max = max(file_stats['max'] for file_stats in all_stats.values())
largest_median = max(file_stats['median'] for file_stats in all_stats.values())

# Candidate max_length values for the tokenizer. The original labels were
# truncated ("all texts): ...", with an unmatched parenthesis), which left the
# report ambiguous; the numbers printed are unchanged.
print(f"max_length covering all texts: {int(all_max)}")
print(f"max_length covering 95% of texts: {int(all_95th)}")
print(f"Efficient max_length (1.5 x largest median): {int(largest_median * 1.5)}")