In [1]:
%cd drive/MyDrive/MSCI641_Text_Analytics/

/content/drive/MyDrive/MSCI641_Text_Analytics


In [None]:
!pip install torch numpy tqdm transformers datasets sklearn

In [10]:
import os, numpy as np
import pandas as pd
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)

from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from datasets import Dataset, DatasetDict

In [4]:
train_bodies_path = "fnc_data/train_bodies.csv"
train_headlines_path = "fnc_data/train_stances.csv"

test_bodies_path = "fnc_data/competition_test_bodies.csv"
test_headlines_path= "fnc_data/competition_test_stances.csv"

In [5]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased", use_fast=True)

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [6]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    p, r, f1, _ = precision_recall_fscore_support(labels, preds, average="micro")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "precision": p, "recall": r, "f1": f1}

In [7]:
def get_data(bodies_path, headlines_path):
  bodies_df = pd.read_csv(bodies_path)
  stances_df = pd.read_csv(headlines_path)
  df = pd.merge(bodies_df, stances_df, how='right', on='Body ID')
  
  features = []
  headlines = df['Headline']
  body = df['articleBody']
  for i in range(len(headlines)):
    f = headlines[i] + body[i]
    features.append(f)

  #features = list(headlines)
  LABELS_MAP = {'agree':0, 'disagree':1, 'discuss':2, 'unrelated':3}
  labels = [LABELS_MAP[x] for x in df['Stance']]

  return features, labels

In [28]:
def get_score(y_pred, y_true):
    n = len(y_pred)
    related = ["agree","disagree","discuss"]
    score = 0
    for i in range(n):
        if y_pred[i] == y_true[i]:
            score+=0.25
            if y_true[i]!='unrelated':
                score+=0.50
        if y_pred[i] in related and y_true[i] in related:
            score+=0.25
    print("Model Score:",score)
    return

In [13]:
features_train, labels_train = get_data(train_bodies_path, train_headlines_path)

In [15]:
x_test, y_test = get_data(test_bodies_path, test_headlines_path)

In [None]:
from sklearn.model_selection import train_test_split
x_train, x_val, y_train, y_val = train_test_split(features_train, labels_train, stratify=labels_train, train_size=0.80)

In [None]:
import collections
print(collections.Counter(y_train))
print(collections.Counter(y_val))

In [16]:
data_dict = DatasetDict(
    #train=Dataset.from_dict({"text": x_train, "label": y_train}),
    train=Dataset.from_dict({"text":  features_train, "label": labels_train}),
    #val=Dataset.from_dict({"text": x_val, "label": y_val}),
    test=Dataset.from_dict({"text": x_test, "label": y_test}),
)

In [36]:
def preprocess(example):
    return tokenizer(example['text'], max_length=300, truncation=True)
encoded_dataset = data_dict.map(preprocess, batched=True)

  0%|          | 0/50 [00:00<?, ?ba/s]

  0%|          | 0/26 [00:00<?, ?ba/s]

In [37]:
encoded_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 49972
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 25413
    })
})

In [38]:
backbone = AutoModelForSequenceClassification.from_pretrained(
        "distilbert-base-uncased", num_labels=4
    )
    # https://huggingface.co/docs/transformers/v4.20.1/en/main_classes/trainer#transformers.TrainingArguments
training_args = TrainingArguments(
        "checkpoints",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=1e-5,
        warmup_ratio=0.1,
        weight_decay=0.01,
        load_best_model_at_end=True,
        num_train_epochs=1,
        metric_for_best_model="accuracy",
        #per_device_eval_batch_size=64,
        #per_device_train_batch_size=64,
    )

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.21.1

In [45]:
trainer = Trainer(
        backbone,
        training_args,
        train_dataset=encoded_dataset["train"],
        eval_dataset=encoded_dataset["test"],
        #eval_dataset=encoded_dataset["val"],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 49972
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 6247


Epoch,Training Loss,Validation Loss


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 25413
  Batch size = 8


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1601,0.360448,0.898674,0.898674,0.898674,0.898674


Saving model checkpoint to checkpoints/checkpoint-6247
Configuration saved in checkpoints/checkpoint-6247/config.json
Model weights saved in checkpoints/checkpoint-6247/pytorch_model.bin
tokenizer config file saved in checkpoints/checkpoint-6247/tokenizer_config.json
Special tokens file saved in checkpoints/checkpoint-6247/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from checkpoints/checkpoint-6247 (score: 0.898673907055444).


TrainOutput(global_step=6247, training_loss=0.15925717098113346, metrics={'train_runtime': 1750.8087, 'train_samples_per_second': 28.542, 'train_steps_per_second': 3.568, 'total_flos': 3878845869225600.0, 'train_loss': 0.15925717098113346, 'epoch': 1.0})

In [40]:
trainer.evaluate(eval_dataset=encoded_dataset["test"], metric_key_prefix="test")    

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 25413
  Batch size = 8


{'epoch': 1.0,
 'test_accuracy': 0.898673907055444,
 'test_f1': 0.898673907055444,
 'test_loss': 0.36044758558273315,
 'test_precision': 0.898673907055444,
 'test_recall': 0.898673907055444,
 'test_runtime': 256.3595,
 'test_samples_per_second': 99.13,
 'test_steps_per_second': 12.393}

#### Predictions

In [46]:
predictions = trainer.predict(encoded_dataset["test"])

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 25413
  Batch size = 8


In [47]:
Int_to_label = {0:'agree', 1:'disagree', 2:'discuss', 3:'unrelated'}

In [51]:
preds = predictions.predictions.argmax(-1)

pred_arr = []
for p in preds:
  pred_arr.append(Int_to_label[p])

In [52]:
df_test = pd.read_csv(test_headlines_path)
get_score(pred_arr,df_test['Stance'])

Model Score: 9815.25


In [53]:
import pandas as pd
test_headlines = pd.read_csv(test_headlines_path).drop(["Stance"], axis=1)
test_headlines["Stance"] = pred_arr
test_headlines.head()

Unnamed: 0,Headline,Body ID,Stance
0,Ferguson riots: Pregnant woman loses eye after...,2008,unrelated
1,Crazy Conservatives Are Sure a Gitmo Detainee ...,1550,unrelated
2,A Russian Guy Says His Justin Bieber Ringtone ...,2,unrelated
3,"Zombie Cat: Buried Kitty Believed Dead, Meows ...",1793,unrelated
4,Argentina's President Adopts Boy to End Werewo...,37,unrelated


In [55]:
test_headlines.to_csv("answer.csv",index=False)
from google.colab import files
files.download("answer.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>