# Requirements

In [None]:
!pip install transformers[sentencepiece]
!pip install datasets==1.16.1

Collecting transformers[sentencepiece]
  Downloading transformers-4.13.0-py3-none-any.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 5.4 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 37.2 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 514 kB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 47.5 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 56.2 MB/s 
Collecting sentencepiece!=0.1.92,>=0.1.91
  Downloading sentencepiece-0.1.96-cp

In [None]:
import transformers
from transformers import pipeline
from google.colab import drive
import os
import pandas as pd

In [None]:
import torch

use_gpu = True

# Fail fast when the runtime's hardware does not match the `use_gpu` flag, so a
# misconfigured session is caught here instead of during training.
# Errors are raised explicitly rather than via `assert False`, which is skipped
# when Python runs with assertions disabled.
if use_gpu:
  # A GPU was requested: it must be visible to PyTorch.
  if not torch.cuda.is_available():
    raise RuntimeError("ERROR: No GPU detected. Add a GPU.")
  # Report the current GPU's name and how many GPUs are visible.
  device_name = torch.cuda.get_device_name()
  n_gpu = torch.cuda.device_count()
  print("Found device: {}, n_gpu: {}".format(device_name, n_gpu))
else:
  # CPU was requested: a visible GPU means the flag and the runtime disagree.
  if torch.cuda.is_available():
    raise RuntimeError(
        "ERROR: GPU detected. "
        "Remove the GPU or set the use_gpu flag to True.")
  print("No GPU found. Using CPU.")
  print("WARNING: Without a GPU, your code in Parts 4 and 5 will be extremely slow.")

Found device: Tesla K80, n_gpu: 1


# Setup

In [None]:
# Mount Google Drive at /gdrive so files stored there can be reached by path.
# `drive` comes from the `google.colab` import above, so this cell requires Colab.
drive.mount('/gdrive')

Mounted at /gdrive


In [None]:
# Make the Drive root the working directory so later relative paths resolve against it.
os.chdir('/gdrive/MyDrive')

In [None]:
# Step into the project's data folder, relative to the current working directory.
%cd "./Final Project/Data"

/gdrive/MyDrive/Final Project/Data


In [None]:
# Read the combined lyric table and pull the "lyric" column out as a plain list.
df = pd.read_csv("taylor_swift_combined_with_normalizations.csv", encoding="utf-8")
lyrics = df["lyric"].tolist()

# Sanity check against the model's 512-token input limit.
# NOTE(review): len() on a lyric counts characters (assuming the column holds
# strings), so this is a proxy for the token count rather than the count itself.
max_len = max(len(text) for text in lyrics)
print(max_len)

87


# Data Exploration
Even though we already explored the data, we should check how imbalanced it is when valence is reduced to just two classes: positive and negative.

In [None]:
# Count the rows in each binary sentiment class: valence below 0.5 is treated
# as negative, everything else (including missing values) as positive.
valence_values = df["valence"]

# Vectorized comparison instead of a Python-level loop over every row.
is_negative = valence_values < 0.5
num_negative = int(is_negative.sum())
num_positive = int((~is_negative).sum())

print("NEGATIVE: ", num_negative)
print("POSITIVE: ", num_positive)

NEGAITVE:  2086
POSITIVE:  1069


# Processing Data
Main tutorial: https://huggingface.co/course/chapter3/1?fw=pt

Processing data tutorial: https://huggingface.co/course/chapter3/2?fw=pt

In [None]:
# Derive the binary training target from valence:
# 0 when valence is below 0.5, otherwise 1. Stored in a new "label" column.
valence_values = df["valence"]
valences = [0 if score < 0.5 else 1 for score in valence_values]
df["label"] = valences

In [None]:
# Set up tokenizer
from transformers import AutoTokenizer

# Name of the pretrained checkpoint. The same `checkpoint` string is passed to
# `AutoModelForSequenceClassification.from_pretrained` further down, so the
# tokenizer and the models are loaded from the same checkpoint.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [None]:
from sklearn.model_selection import train_test_split

# Two-stage split: hold out 20% of all rows for testing, then hold out 20% of
# what remains for validation. Both stages shuffle with the same fixed seed.
split_options = dict(test_size=.2, shuffle=True, random_state=42)

train_df, test_df = train_test_split(df, **split_options)
train_df, val_df = train_test_split(train_df, **split_options)

In [None]:
# Pull the lyric text and the 0/1 labels out of each split as plain Python lists.
train_text, train_labels = train_df["lyric"].tolist(), train_df["label"].tolist()

test_text, test_labels = test_df["lyric"].tolist(), test_df["label"].tolist()

val_text, val_labels = val_df["lyric"].tolist(), val_df["label"].tolist()

In [None]:
# Row counts of the train, test and validation splits, in that order.
print(*map(len, (train_df, test_df, val_df)))

2019 631 505


In [None]:
from torch.utils.data import Dataset, DataLoader

class TaylorSwiftDataset(torch.utils.data.Dataset):
  """Map-style dataset pairing tokenizer encodings with their labels.

  Args:
    encodings: Mapping from feature name (e.g. ``input_ids``) to a sequence
      holding one entry per example, such as the result of a tokenizer call.
    labels: Sequence of labels, one per example.
    tokenizer: Optional tokenizer kept on the instance for reference only;
      the dataset itself never calls it. Defaults to ``None``.
  """

  def __init__(self, encodings, labels, tokenizer=None):
    self.encodings = encodings
    self.labels = labels
    # This used to read a module-level `tokenizer`, so the class could not be
    # instantiated unless that global happened to exist. It is now an
    # explicit, optional argument.
    self.tokenizer = tokenizer

  def __getitem__(self, idx):
    """Return example ``idx`` as a dict of tensors, with its label under ``'labels'``."""
    item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    item['labels'] = torch.tensor(self.labels[idx])
    return item

  def __len__(self):
    """Return the number of examples, taken from the number of labels."""
    return len(self.labels)

In [None]:
# Tokenize each split (train, then validation, then test), truncating inputs
# that exceed the tokenizer's maximum length. No padding is requested here.
train_encodings, val_encodings, test_encodings = (
    tokenizer(split_text, truncation=True)
    for split_text in (train_text, val_text, test_text)
)

In [None]:
# Wrap each split's encodings and labels in a dataset object for the Trainer.
train_dataset = TaylorSwiftDataset(encodings=train_encodings, labels=train_labels)
val_dataset = TaylorSwiftDataset(encodings=val_encodings, labels=val_labels)
test_dataset = TaylorSwiftDataset(encodings=test_encodings, labels=test_labels)

In [None]:
# Source: https://huggingface.co/transformers/training.html
from datasets import load_metric

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair as unpacked below.

    Returns:
        Whatever ``metric.compute`` returns for the predicted class ids
        (argmax over the last axis of the logits) against the labels.
    """
    # Imported locally because the notebook's only other `import numpy` sits in
    # a later cell; without this, calling the function before that cell runs
    # raises NameError.
    import numpy

    logits, labels = eval_pred
    predictions = numpy.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

Downloading:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

# Fine-tuning a pretrained model
Fine-tuning tutorial: https://huggingface.co/course/chapter3/3?fw=pt


In [None]:
from transformers import TrainingArguments, AutoModelForSequenceClassification, Trainer


# Baseline run: only the output directory is given, every other training
# setting is left at the library default.
training_args = TrainingArguments("test-trainer")

# Pretrained checkpoint with a two-class classification head (labels 0 and 1).
# NOTE(review): the model is created before the Trainer; if the Trainer is what
# seeds the RNG, the new head's initial weights may differ between runs - confirm.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint, num_labels=2)

trainer = Trainer(
    model=model,                         # the model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation dataset
    compute_metrics=compute_metrics,     # function to be used in evaluation
    tokenizer=tokenizer,                 # enable dynamic padding
)



Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
# Fine-tune the baseline `model` on `train_dataset` with the settings in `training_args`.
trainer.train()

***** Running training *****
  Num examples = 2019
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 759


Step,Training Loss
500,0.587


Saving model checkpoint to test-trainer/checkpoint-500
Configuration saved in test-trainer/checkpoint-500/config.json
Model weights saved in test-trainer/checkpoint-500/pytorch_model.bin
tokenizer config file saved in test-trainer/checkpoint-500/tokenizer_config.json
Special tokens file saved in test-trainer/checkpoint-500/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=759, training_loss=0.47234062023941864, metrics={'train_runtime': 194.0387, 'train_samples_per_second': 31.215, 'train_steps_per_second': 3.912, 'total_flos': 50263975460580.0, 'train_loss': 0.47234062023941864, 'epoch': 3.0})

In [None]:
import numpy
# Test accuracy of the baseline trainer: correct predictions / number of test examples.
test_results = trainer.predict(test_dataset)
predicted_labels = test_results.predictions.argmax(1)
num_correct = (predicted_labels == test_results.label_ids).sum()
acc = num_correct / len(test_dataset)
print(acc.item())

***** Running Prediction *****
  Num examples = 631
  Batch size = 8


0.6782884310618067


In [None]:
# Second run: explicit hyperparameters, with evaluation and logging once per epoch.
# NOTE(review): `output_dir` is the same './results' used by `training_args_3`,
# so checkpoints from both runs are written to the same folder.
training_args_2 = TrainingArguments(
    output_dir='./results',         # output directory
    num_train_epochs=4,             # total number of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=64,  # batch size for evaluation
    evaluation_strategy="epoch",    # evaluation occurs after each epoch
    logging_dir='./logs',           # directory for storing logs
    logging_strategy="epoch",       # logging occurs after each epoch
    learning_rate=3e-5,
    seed=130,
)

# Fresh copy of the pretrained checkpoint with a two-class head, so this run
# does not continue from the baseline model's fine-tuned weights.
model_2 = AutoModelForSequenceClassification.from_pretrained(
    checkpoint, num_labels=2)

trainer_2 = Trainer(
    model=model_2,                       # the model to be trained
    args=training_args_2,                # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation dataset
    compute_metrics=compute_metrics,     # function to be used in evaluation
    tokenizer=tokenizer,                 # enable dynamic padding
)

# Train the trainer built in this cell. The cell previously called
# `trainer.train()`, which re-trained the baseline model and left `model_2`
# untrained when it was evaluated afterwards.
trainer_2.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": 

Step,Training Loss
500,0.1198


Saving model checkpoint to test-trainer/checkpoint-500
Configuration saved in test-trainer/checkpoint-500/config.json
Model weights saved in test-trainer/checkpoint-500/pytorch_model.bin
tokenizer config file saved in test-trainer/checkpoint-500/tokenizer_config.json
Special tokens file saved in test-trainer/checkpoint-500/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=759, training_loss=0.12259214197694078, metrics={'train_runtime': 165.9001, 'train_samples_per_second': 36.51, 'train_steps_per_second': 4.575, 'total_flos': 50114947714380.0, 'train_loss': 0.12259214197694078, 'epoch': 3.0})

In [None]:
# Test accuracy of the second trainer: correct predictions / number of test examples.
test_results = trainer_2.predict(test_dataset)
predicted_labels = test_results.predictions.argmax(1)
num_correct = (predicted_labels == test_results.label_ids).sum()
acc = num_correct / len(test_dataset)
print(acc.item())

***** Running Prediction *****
  Num examples = 631
  Batch size = 64


0.5467511885895404


In [None]:
from transformers import AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments

# Third run. These arguments are identical to `training_args_2`, including the
# seed, the learning rate and the './results' output directory.
# NOTE(review): sharing './results' means checkpoints from both runs are written
# to the same folder.
training_args_3 = TrainingArguments(
    output_dir='./results',         # output directory
    num_train_epochs=4,             # total number of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=64,  # batch size for evaluation
    evaluation_strategy="epoch",    # evaluation occurs after each epoch
    logging_dir='./logs',           # directory for storing logs
    logging_strategy="epoch",       # logging occurs after each epoch
    # Hyperparameters overriding the library defaults:
    learning_rate = 3e-5,
    seed = 130
    
)

# Fresh copy of the pretrained checkpoint with a two-class classification head.
model_3 = AutoModelForSequenceClassification.from_pretrained(
    checkpoint, num_labels=2)

trainer_3 = Trainer(
    model=model_3,                         # the model to be trained
    args=training_args_3,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation dataset
    compute_metrics=compute_metrics,     # function to be used in evaluation
    tokenizer=tokenizer,                 # enable dynamic padding
)

# Fine-tune `model_3`; validation metrics are reported after every epoch.
trainer_3.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": 

Epoch,Training Loss,Validation Loss,Accuracy
1,0.6424,0.636228,0.657426
2,0.5363,0.607788,0.708911
3,0.2688,1.119203,0.671287
4,0.1139,1.356217,0.693069


***** Running Evaluation *****
  Num examples = 505
  Batch size = 64
Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 505
  Batch size = 64
***** Running Evaluation *****
  Num examples = 505
  Batch size = 64
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 505
  Batch size = 64


Training completed. Do not forget to share yo

TrainOutput(global_step=1012, training_loss=0.39033030144310754, metrics={'train_runtime': 235.2253, 'train_samples_per_second': 34.333, 'train_steps_per_second': 4.302, 'total_flos': 66974096919840.0, 'train_loss': 0.39033030144310754, 'epoch': 4.0})