In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from transformers import set_seed
from datasets import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split

# Seed passed to the pandas sampling step and the sklearn train/eval split.
RANDOM_SEED = 69
# Seed reserved for PyTorch-side randomness (presumably model initialisation).
TORCH_SEED = 69

# Maximum number of tokens per example; the tokenizer truncates beyond this.
TOKEN_LIMIT = 400

In [None]:
# Load the fine-tuning corpus, then keep a seeded 30% sample of the Twitter
# rows together with every row whose source is not Twitter.
data = pd.read_csv('finetuning_dataset.csv')

twitter_data = (
    data[data['source'].eq('Twitter')]
    .sample(frac=0.3, random_state=RANDOM_SEED)
)
other_data = data[data['source'].ne('Twitter')]
sampled_data = pd.concat([twitter_data, other_data])

# Print the total of the per-source counts as a quick size check.
print(sampled_data['source'].value_counts().sum())

In [None]:
# Convert the sampled frame into a Hugging Face Dataset.
# `sampled_data` still carries the non-contiguous index left over from
# `.sample()` / `pd.concat`; by default `from_pandas` would store that index
# as an extra `__index_level_0__` column, so it is dropped here.
dataset = Dataset.from_pandas(sampled_data, preserve_index=False)
dataset

In [None]:
# Load the tokenizer from the local "./mlm_expanded_model" directory, the same
# path the classification model is loaded from further down.
tokenizer = BertTokenizer.from_pretrained("./mlm_expanded_model")

In [None]:
def tokenize_function(data):
    """Tokenize the `text` field of a batch, truncating to TOKEN_LIMIT tokens.

    No padding is requested here; the result is whatever the tokenizer
    returns for the given texts.
    """
    texts = data['text']
    return tokenizer(texts, max_length=TOKEN_LIMIT, truncation=True)


# Apply the tokenizer over the whole dataset in batches.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

In [None]:
# Go back to pandas so scikit-learn can produce an 80/20 split that is
# stratified on the `source` column.
df = tokenized_dataset.to_pandas()

train_df, eval_df = train_test_split(
    df, test_size=0.2, random_state=RANDOM_SEED, stratify=df['source']
)


def _with_labels(row):
    """Copy the row's `polarity` value into a `labels` field."""
    return {'labels': row['polarity']}


# Rebuild the evaluation split as a Dataset with a fresh 0..n-1 index and a
# `labels` column derived from `polarity`.
eval_df = eval_df.reset_index(drop=True)
eval_dataset = Dataset.from_pandas(eval_df).map(_with_labels)

print(f"Evaluation size: {len(eval_dataset)}")

In [None]:
# Show the evaluation Dataset as the cell output.
eval_dataset

In [None]:
# Print the first evaluation record to inspect its fields.
record = eval_dataset[0]
print(record)

In [None]:
from transformers import DataCollatorWithPadding

# Collator built from the tokenizer. Tokenization above requests truncation
# only (no padding), so padding is left to this collator at batch time.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """Compute binary classification metrics for an evaluation pass.

    Args:
        pred: Prediction object exposing `predictions` (per-class scores; the
            predicted class is the argmax over the last axis) and `label_ids`
            (the reference labels).

    Returns:
        dict with the keys `accuracy`, `f1`, `precision` and `recall`.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # zero_division=0 yields the same 0.0 score sklearn falls back to by
    # default, but without emitting UndefinedMetricWarning when the positive
    # class is never predicted or is absent from the labels.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [None]:
# Seed the Python, NumPy and PyTorch RNGs before loading the model. If the
# checkpoint does not contain a sequence-classification head (its name
# suggests an MLM checkpoint -- confirm), `from_pretrained` initialises that
# head randomly, and without a fixed seed the evaluation metrics would differ
# on every run.
set_seed(TORCH_SEED)
model = BertForSequenceClassification.from_pretrained("./mlm_expanded_model", num_labels=2)

In [None]:
# Per-device batch size used for evaluation.
batch_size = 16

# Evaluation-only configuration: training is switched off and only the
# evaluation batch size is set.
training_args = TrainingArguments(
    output_dir="./results",
    logging_dir="./logs",
    do_train=False,
    do_eval=True,
    per_device_eval_batch_size=batch_size,
)

# No train_dataset is supplied; the Trainer is given only what the
# evaluation pass below uses.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    eval_dataset=eval_dataset,
)

In [None]:
# Run evaluation on the Trainer's eval_dataset and print the returned metrics.
eval_results = trainer.evaluate()
print("Evaluation results:", eval_results)