In [13]:
%cd drive/MyDrive/

/content/drive/MyDrive


In [12]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [14]:
!pip install torch numpy tqdm transformers datasets sklearn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [15]:
import os, numpy as np
import pandas as pd

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)

from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from datasets import Dataset, DatasetDict

In [17]:
train_bodies_path = "fnc/train_bodies.csv"
train_headlines_path = "fnc/train_stances.csv"

test_bodies_path = "fnc/competition_test_bodies.csv"
test_headlines_path= "fnc/competition_test_stances.csv"

In [18]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased", use_fast=True)

In [6]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    p, r, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "precision": p, "recall": r, "f1": f1}

In [19]:
def get_score(y_pred, y_true):
    n = len(y_pred)
    related = ["agree","disagree","discuss"]
    score = 0
    for i in range(n):
        if y_pred[i] == y_true[i]:
            score+=0.25
            if y_true[i]!='unrelated':
                score+=0.50
        if y_pred[i] in related and y_true[i] in related:
            score+=0.25
    print("Model Score:",score)
    return

In [20]:
def get_data(bodies_path, headlines_path):
  bodies_df = pd.read_csv(bodies_path)
  stances_df = pd.read_csv(headlines_path)
  df = pd.merge(bodies_df, stances_df, how='right', on='Body ID')
  
  features = []
  headlines = df['Headline']
  body = df['articleBody']
  for i in range(len(headlines)):
    f = headlines[i] + body[i]
    features.append(f)

  #features = list(headlines)
  LABELS_MAP = {'agree':0, 'disagree':1, 'discuss':2, 'unrelated':3}
  labels = [LABELS_MAP[x] for x in df['Stance']]

  return features, labels

In [21]:
features_train, labels_train = get_data(train_bodies_path, train_headlines_path)

In [22]:
x_test, y_test = get_data(test_bodies_path, test_headlines_path)

In [23]:
from sklearn.model_selection import train_test_split
x_train, x_val, y_train, y_val = train_test_split(features_train, labels_train, stratify=labels_train, train_size=0.80)

In [24]:
import collections
print(collections.Counter(y_train))
print(collections.Counter(y_val))

Counter({3: 29236, 2: 7127, 0: 2942, 1: 672})
Counter({3: 7309, 2: 1782, 0: 736, 1: 168})


In [25]:
data_dict = DatasetDict(
    train=Dataset.from_dict({"text": x_train, "label": y_train}),
    val=Dataset.from_dict({"text": x_val, "label": y_val}),
    test=Dataset.from_dict({"text": x_test, "label": y_test}),
)

In [26]:
def preprocess(example):
    return tokenizer(example['text'], max_length=80,truncation=True)
encoded_dataset = data_dict.map(preprocess, batched=True)

  0%|          | 0/40 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/26 [00:00<?, ?ba/s]

In [37]:
backbone = AutoModelForSequenceClassification.from_pretrained(
        "distilbert-base-uncased", num_labels=4
    )
    # https://huggingface.co/docs/transformers/v4.20.1/en/main_classes/trainer#transformers.TrainingArguments
training_args = TrainingArguments(
        "checkpoints",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=1e-5,
        warmup_ratio=0.1,
        weight_decay=0.01,
        load_best_model_at_end=True,
        num_train_epochs=1,
        metric_for_best_model="accuracy",
        per_device_eval_batch_size=64,
        per_device_train_batch_size=64,
    )

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.21.1

In [38]:
trainer = Trainer(
        backbone,
        training_args,
        train_dataset=encoded_dataset["train"],
        eval_dataset=encoded_dataset["val"],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 39977
  Num Epochs = 5
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 3125


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0722,0.113093,0.963682,0.962195,0.963682,0.962087
2,0.0727,0.113093,0.963682,0.962195,0.963682,0.962087
3,0.074,0.113093,0.963682,0.962195,0.963682,0.962087
4,0.0734,0.113093,0.963682,0.962195,0.963682,0.962087
5,0.0703,0.113093,0.963682,0.962195,0.963682,0.962087


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9995
  Batch size = 64
Saving model checkpoint to checkpoints/checkpoint-625
Configuration saved in checkpoints/checkpoint-625/config.json
Model weights saved in checkpoints/checkpoint-625/pytorch_model.bin
tokenizer config file saved in checkpoints/checkpoint-625/tokenizer_config.json
Special tokens file saved in checkpoints/checkpoint-625/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples

TrainOutput(global_step=3125, training_loss=0.07230766418457031, metrics={'train_runtime': 1401.2131, 'train_samples_per_second': 142.651, 'train_steps_per_second': 2.23, 'total_flos': 4137373497532800.0, 'train_loss': 0.07230766418457031, 'epoch': 5.0})

In [30]:
trainer.evaluate(eval_dataset=encoded_dataset["test"], metric_key_prefix="test")    

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 25413
  Batch size = 64


{'epoch': 5.0,
 'test_accuracy': 0.8906465194978948,
 'test_f1': 0.8848715857581613,
 'test_loss': 0.38790270686149597,
 'test_precision': 0.8829542785114632,
 'test_recall': 0.8906465194978948,
 'test_runtime': 56.4543,
 'test_samples_per_second': 450.151,
 'test_steps_per_second': 7.05}

#### Predictions

In [39]:
predictions = trainer.predict(encoded_dataset["test"])
preds = predictions.predictions.argmax(-1)

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 25413
  Batch size = 64


In [40]:
pred_arr = []
for p in preds:
  pred_arr.append(Int_to_label[p])
print(pred_arr)

['unrelated', 'unrelated', 'unrelated', 'unrelated', 'unrelated', 'unrelated', 'unrelated', 'unrelated', 'unrelated', 'unrelated', 'discuss', 'unrelated', 'discuss', 'unrelated', 'discuss', 'disagree', 'unrelated', 'unrelated', 'unrelated', 'unrelated', 'discuss', 'agree', 'agree', 'agree', 'agree', 'unrelated', 'unrelated', 'discuss', 'discuss', 'discuss', 'unrelated', 'unrelated', 'unrelated', 'unrelated', 'unrelated', 'unrelated', 'unrelated', 'unrelated', 'unrelated', 'unrelated', 'unrelated', 'unrelated', 'unrelated', 'unrelated', 'discuss', 'unrelated', 'unrelated', 'unrelated', 'discuss', 'discuss', 'discuss', 'unrelated', 'unrelated', 'unrelated', 'unrelated', 'discuss', 'unrelated', 'discuss', 'unrelated', 'unrelated', 'unrelated', 'unrelated', 'disagree', 'unrelated', 'unrelated', 'unrelated', 'discuss', 'unrelated', 'unrelated', 'unrelated', 'discuss', 'unrelated', 'agree', 'unrelated', 'agree', 'unrelated', 'agree', 'unrelated', 'discuss', 'unrelated', 'unrelated', 'discuss

In [41]:
Int_to_label = {0:'agree', 1:'disagree', 2:'discuss', 3:'unrelated'}

In [35]:
df_test = pd.read_csv(test_headlines_path)
len(df_test['Stance'])

25413

In [42]:
get_score(pred_arr,df_test['Stance'])

Model Score: 9732.0


In [None]:
import pandas as pd
test_headlines = pd.read_csv(test_headlines_path).drop(["Stance"], axis=1)
test_headlines["Stance"] = pred_arr
test_headlines.head()

Unnamed: 0,Headline,Body ID,Stance
0,Ferguson riots: Pregnant woman loses eye after...,2008,unrelated
1,Crazy Conservatives Are Sure a Gitmo Detainee ...,1550,unrelated
2,A Russian Guy Says His Justin Bieber Ringtone ...,2,unrelated
3,"Zombie Cat: Buried Kitty Believed Dead, Meows ...",1793,unrelated
4,Argentina's President Adopts Boy to End Werewo...,37,unrelated


In [None]:
test_headlines.to_csv("transformer.csv",index=False)
from google.colab import files
files.download("transformer.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>