In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split

# Reproducibility: RANDOM_SEED is the `random_state` for the pandas sampling
# and the scikit-learn split; TORCH_SEED is handed to `torch.manual_seed`.
RANDOM_SEED = TORCH_SEED = 69

# Upper bound on tokens per example (`max_length` of the tokenizer call).
TOKEN_LIMIT = 400

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive. This cell needs the `google.colab`
# package, so it only runs inside a Colab runtime.
# NOTE(review): later cells read their CSVs via relative paths rather than from
# /content/drive -- confirm the working directory these files are expected in.
drive.mount('/content/drive')

In [None]:
# Prefer the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(f"Using device: {device}")

In [None]:
# Seed PyTorch's global random number generator with TORCH_SEED.
# NOTE(review): only PyTorch is seeded here; Python's `random` and NumPy are
# not -- confirm whether anything downstream relies on them.
torch.manual_seed(TORCH_SEED)

In [None]:
# Load the full fine-tuning corpus and isolate the rows that came from Twitter.
data = pd.read_csv('finetuning_dataset.csv')
is_twitter = data['source'] == 'Twitter'
twitter_data = data[is_twitter]

# 30% of the Twitter rows go into the mixed dataset built in the next cell.
twitter_sample = twitter_data.sample(frac=0.3, random_state=RANDOM_SEED)

# The Twitter rows that were NOT drawn above form a disjoint pool, from which a
# fixed-size sample is taken for the Twitter-only training set.
not_yet_sampled = ~twitter_data.index.isin(twitter_sample.index)
twitter_remaining = twitter_data[not_yet_sampled]
twitter_remaining_sample = twitter_remaining.sample(n=303556, random_state=RANDOM_SEED)

# Sanity checks: class balance of both samples, and the source make-up of the
# Twitter-only sample.
for counts in (
    twitter_sample['polarity'].value_counts(),
    twitter_remaining_sample['polarity'].value_counts(),
    twitter_remaining_sample['source'].value_counts(),
):
    print(counts)

In [None]:
# All non-Twitter rows are kept in full and combined with the 30% Twitter sample.
non_twitter_mask = data['source'] != 'Twitter'
other_data = data[non_twitter_mask]
sampled_data = pd.concat([twitter_sample, other_data])

# Number of rows with a non-null `source` in the mixed set, then in the
# Twitter-only set.
for frame in (sampled_data, twitter_remaining_sample):
    print(frame['source'].value_counts().sum())

In [None]:
# Wrap both DataFrames as `Dataset` objects so they can be tokenized with `.map`.
# `dataset` holds the mixed data (30% Twitter sample + every non-Twitter row);
# `twitter_only_dataset` holds the sample drawn from the remaining Twitter rows.
dataset = Dataset.from_pandas(sampled_data)
twitter_only_dataset = Dataset.from_pandas(twitter_remaining_sample)

In [None]:
# Tokenizer matching the `bert-base-uncased` checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(data):
    """Tokenize the `text` entry of a batch, truncating to TOKEN_LIMIT tokens.

    Args:
        data: Batch handed over by `Dataset.map(..., batched=True)`; only its
            `text` entry is read.

    Returns:
        The tokenizer's output for the batch. No padding argument is passed
        here; a `DataCollatorWithPadding` is set up in a later cell.
    """
    return tokenizer(data['text'], truncation=True, max_length=TOKEN_LIMIT)

# Tokenize the mixed dataset and the Twitter-only dataset with identical settings.
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_twitter_only_dataset = twitter_only_dataset.map(tokenize_function, batched=True)

In [None]:
# Round-trip through pandas so scikit-learn can perform a stratified split.
df = tokenized_dataset.to_pandas()

# 80/20 train/eval split that preserves the proportion of each `source`.
train_df, eval_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['source'],
    random_state=RANDOM_SEED
)

train_df = train_df.reset_index(drop=True)
eval_df = eval_df.reset_index(drop=True)


def _with_labels(dataset):
    """Return `dataset` with a `labels` column copied from `polarity`.

    Uses a batched map so the copy runs once per batch instead of once per
    row; the resulting column is the same.
    """
    return dataset.map(lambda batch: {'labels': batch['polarity']}, batched=True)


# The targets are exposed under the name `labels` on every dataset.
train_dataset = _with_labels(Dataset.from_pandas(train_df))
eval_dataset = _with_labels(Dataset.from_pandas(eval_df))
tokenized_twitter_only_dataset = _with_labels(tokenized_twitter_only_dataset)

print(f"Training size: {len(train_dataset)}, Evaluation size: {len(eval_dataset)}, Twitter Only Dataset Size: {len(tokenized_twitter_only_dataset)}")

In [None]:
from transformers import DataCollatorWithPadding

# Collator built around the same tokenizer; the tokenization step above passes
# no padding argument, so padding is left to this collator at batch time.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """Score a set of predictions with accuracy, F1, precision and recall.

    Args:
        pred: Object exposing `label_ids` (reference labels) and `predictions`
            (per-class scores). The predicted class is the argmax over the
            last axis of `predictions`.

    Returns:
        dict with the keys `accuracy`, `f1`, `precision` and `recall`; the
        last three are computed with `average='binary'`.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # Result layout: (precision, recall, f1, support); support is not reported.
    prf_scores = precision_recall_fscore_support(y_true, y_pred, average='binary')

    return dict(
        accuracy=accuracy_score(y_true, y_pred),
        f1=prf_scores[2],
        precision=prf_scores[0],
        recall=prf_scores[1],
    )

In [None]:
# Hyperparameters shared by the training cell below.
batch_size = 16
epochs = 3

# Number of optimizer steps in the whole run; used to size the LR warmup.
# The training run in this notebook is fed `tokenized_twitter_only_dataset`,
# so the count has to come from that dataset rather than from `train_dataset`.
# Ceiling division is used because the final, partial batch of an epoch is
# still one step.
# NOTE(review): assumes a single device and no gradient accumulation.
steps_per_epoch = (len(tokenized_twitter_only_dataset) + batch_size - 1) // batch_size
total_steps = steps_per_epoch * epochs

# Train with only Twitter Data

In [None]:
# Load the `bert-base-uncased` checkpoint as a sequence classifier configured
# for two labels; this is the model fine-tuned in the next cell.
base_model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

In [None]:
# Fine-tune `base_model` on the Twitter-only training set. Evaluation and
# checkpointing both happen once per epoch, and the best checkpoint is
# reloaded when training finishes.
training_args = TrainingArguments(
    output_dir="./twitter_only_results",
    eval_strategy="epoch",
    learning_rate=3e-5,
    # Reuse `batch_size` so the batch size used for training cannot drift away
    # from the one `total_steps` was computed with.
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=0.01,
    # Warm the learning rate up over the first 10% of `total_steps`.
    warmup_steps=int(0.1 * total_steps),
    save_strategy="epoch",
    save_total_limit=10,
    logging_dir="./logs",
    report_to="none",
    logging_steps=500,
    load_best_model_at_end=True
)

# Train on the Twitter-only data; evaluate on the held-out split of the mixed
# dataset.
trainer = Trainer(
    model=base_model,
    args=training_args,
    train_dataset=tokenized_twitter_only_dataset,
    eval_dataset=eval_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=data_collator
)

trainer.train()

# A notebook cell only displays its last expression, so the evaluation metrics
# are captured and printed explicitly instead of being silently discarded.
twitter_only_eval_metrics = trainer.evaluate()
print(twitter_only_eval_metrics)

# Persist the fine-tuned weights together with the tokenizer files.
base_model.save_pretrained('./twitter_only_model')
tokenizer.save_pretrained('./twitter_only_model')

In [None]:
# Run the fine-tuned Twitter-only model over the held-out evaluation split.
# As the cell's last expression, the returned prediction output is displayed.
trainer.predict(eval_dataset)

# Evaluation

In [None]:
# Load the SemEval-2018 binary test set and tokenize it with the same settings
# as the training data.
sem_eval_df = pd.read_csv('sem_eval_2018_test_binary.csv')
sem_eval_dataset = Dataset.from_pandas(sem_eval_df)
tokenized_sem_eval_dataset = sem_eval_dataset.map(tokenize_function, batched=True)

# Every other dataset in this notebook exposes its targets as `labels` (copied
# from `polarity`). Do the same here so the Trainer has reference labels to
# compute metrics against. The guard makes this a no-op when the CSV uses a
# different schema or already provides `labels`.
# NOTE(review): assumes this CSV stores its targets in `polarity`, like the
# fine-tuning data -- confirm against the file.
sem_eval_columns = tokenized_sem_eval_dataset.column_names
if 'labels' not in sem_eval_columns and 'polarity' in sem_eval_columns:
    tokenized_sem_eval_dataset = tokenized_sem_eval_dataset.map(
        lambda batch: {'labels': batch['polarity']},
        batched=True,
    )

In [None]:
# Run the fine-tuned Twitter-only model over the tokenized SemEval test set.
trainer.predict(tokenized_sem_eval_dataset)

In [None]:
# Load a second two-label classifier from the local `./base_model` directory,
# evaluated in the cells below alongside the Twitter-only model.
# NOTE(review): no visible cell writes `./base_model`; it must already exist on
# disk before this cell runs -- confirm where it comes from.
default_base_model = BertForSequenceClassification.from_pretrained("./base_model", num_labels=2)

In [None]:
# Evaluation-only configuration: no training is requested.
eval_training_args = TrainingArguments(
    output_dir="./results",
    per_device_eval_batch_size=16,
    do_train=False,
    do_eval=True,
    logging_dir="./logs",
    report_to="none"
)

# Trainer wrapping the model loaded from ./base_model, with the SemEval test
# set as its evaluation dataset.
eval_trainer = Trainer(
    model=default_base_model,
    args=eval_training_args,
    eval_dataset=tokenized_sem_eval_dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Evaluate with `eval_trainer`, the trainer built in this cell. Calling
# `trainer.evaluate()` here would instead re-score the fine-tuned Twitter-only
# model on its own evaluation split.
eval_trainer.evaluate()

In [None]:
# Run the model loaded from ./base_model over the tokenized SemEval test set.
eval_trainer.predict(tokenized_sem_eval_dataset)