In [1]:
!pip install datasets
!pip install evaluate
!pip install -U accelerate
!pip install -U transformers





In [2]:
from datasets import load_dataset, DatasetDict, concatenate_datasets, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
import numpy as np
import evaluate
import re
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Data preprocessing

In [3]:
def dataset_to_list(dataset: Dataset) -> list[str]:
    """
    Changes a dataset of conversation data into a list of single turn chat message.
    
    :param dataset: The dataset containing chit chat conversation message.
    :return: A list of chat message.
    """
    list_of_tweet = []
    for row in dataset:
        data = row["dialog"]
        list_of_tweet.extend(data)
    return list_of_tweet

In [4]:
# Load the dataset
dataset_trump = load_dataset("rguo123/trump_tweets")
dataset_twitter = load_dataset("daily_dialog")

# Create validation and testing dataset for trump tweets.
dataset_size = len(dataset_trump['train'])
train_size = int(0.8 * dataset_size)
valid_size = int(0.1 * dataset_size)

# Shuffle the dataset
shuffled_dataset = dataset_trump['train'].shuffle(seed=0)

# Split the dataset
train_dataset = shuffled_dataset.select(range(train_size))
valid_dataset = shuffled_dataset.select(range(train_size, train_size + valid_size))
test_dataset = shuffled_dataset.select(range(train_size + valid_size, dataset_size))

# Create a new DatasetDict
dataset_trump = DatasetDict({
    'train': train_dataset,
    'validation': valid_dataset,
    'test': test_dataset
})

# Preprocessing for false Trump tweets
train_data = {"tweet": dataset_to_list(dataset_twitter["train"])}
validation_data = {"tweet": dataset_to_list(dataset_twitter["validation"])}
test_data = {"tweet": dataset_to_list(dataset_twitter["test"])}

train_data = Dataset.from_dict(train_data)
validation_data = Dataset.from_dict(validation_data)
test_data = Dataset.from_dict(test_data)

dataset_twitter = DatasetDict({
        "train": train_data,
        "validation": validation_data,
        "test": test_data
    } 
)

In [5]:
# Tokenize datasets functions
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
data_collator = DataCollatorWithPadding(tokenizer) # using dynamic padding

def tokenize_function(examples):
    return tokenizer(examples["tweet"], truncation=True)

def true_trump_mapping(row):
    return {"labels": 1}

def false_trump_mapping(row):
    return {"labels": 0}

In [6]:
# This if statement is to make sure it is only run once. 
if "tweet" not in dataset_trump["train"].column_names:
    # Rename the actual tweet column in each dataset as tweet for tokenization
    dataset_trump = dataset_trump.rename_column('content', 'tweet')

    # Remove every column other than tweet
    trump_column_names = dataset_trump["train"].column_names
    trump_column_names.remove("tweet")
    dataset_trump = dataset_trump.remove_columns(trump_column_names)

    # Prepare the dataset
    tokenized_dataset_trump = dataset_trump.map(tokenize_function)
    dataset_trump = tokenized_dataset_trump.map(true_trump_mapping)
    tokenized_dataset_twitter = dataset_twitter.map(tokenize_function)
    dataset_twitter = tokenized_dataset_twitter.map(false_trump_mapping)

Map:   0%|          | 0/4335 [00:00<?, ? examples/s]

Map:   0%|          | 0/4335 [00:00<?, ? examples/s]

Map:   0%|          | 0/87170 [00:00<?, ? examples/s]

Map:   0%|          | 0/8069 [00:00<?, ? examples/s]

Map:   0%|          | 0/7740 [00:00<?, ? examples/s]

Map:   0%|          | 0/87170 [00:00<?, ? examples/s]

Map:   0%|          | 0/8069 [00:00<?, ? examples/s]

Map:   0%|          | 0/7740 [00:00<?, ? examples/s]

In [7]:
# Merge the two datasets.
merge_dataset = DatasetDict({
    'train': concatenate_datasets([dataset_trump["train"], dataset_twitter["train"]]),
    'validation': concatenate_datasets([dataset_trump["validation"], dataset_twitter["validation"]]),
    'test': concatenate_datasets([dataset_trump["test"], dataset_twitter["test"]]),
})

merge_dataset = merge_dataset.shuffle(seed=0)

# Training

In [8]:
# Instantiate the model
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Training hyperparameters
training_hyperparameter = TrainingArguments(output_dir="output_dir",
                                            evaluation_strategy="epoch",
                                            gradient_accumulation_steps=128,
                                            per_device_train_batch_size=16,
                                            per_device_eval_batch_size=16,
                                            num_train_epochs=3,
                                            warmup_steps=500,
                                            learning_rate=2e-5,
                                            weight_decay=1e-3,)

In [10]:
# Prepare for evaluation
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [11]:
# Perform training
trainer = Trainer(
    model=model,
    args=training_hyperparameter,
    train_dataset=merge_dataset["train"],
    eval_dataset=merge_dataset["validation"],
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    tokenizer=tokenizer # using dynamic padding
)

trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
0,No log,0.39781,0.957191
2,No log,0.01561,0.994921


TrainOutput(global_step=177, training_loss=0.27076251358635683, metrics={'train_runtime': 3040.1635, 'train_samples_per_second': 120.241, 'train_steps_per_second': 0.058, 'total_flos': 1.191713836374336e+16, 'train_loss': 0.27076251358635683, 'epoch': 2.97})

# Testing

In [12]:
# Make prediction
prediction = trainer.predict(merge_dataset["test"])
print(f"Test accuracy is {prediction.metrics['test_accuracy']}")

Test accuracy is 0.9947830407419676


# Inference

### Examples

In [13]:
def make_prediction(tweet: str) -> None:
    encoded_input = tokenizer(tweet, return_tensors='pt')
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    with torch.no_grad():
        output = model(**encoded_input)
    if torch.argmax(output.logits).item() == 0:
        print("This is not a Trump tweet")
    else:
        print("This is a Trump tweet")

In [14]:
# Examples
trump_tweet = "Make America great again!"
make_prediction(trump_tweet)
my_tweet = "Yo bro, what's up"
make_prediction(my_tweet)

This is a Trump tweet
This is not a Trump tweet


### Try it out yourself!

In [15]:
# Enter your sentence here
sentence = "Tonight, @FLOTUS and I tested positive for COVID-19. We will begin our quarantine and recovery process immediately. We will get through this TOGETHER!"
make_prediction(sentence)

This is a Trump tweet
