In [None]:
!pip install transformers datasets

import torch
seed=0
import numpy as np
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
from pathlib import Path
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset, load_metric
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
from transformers import TrainingArguments
from transformers import Trainer
import gc
import pandas as pd

# Defining metrics
def metric_fn(predictions):
    """Score one evaluation pass as a binary classification problem.

    Args:
        predictions: object exposing ``predictions`` (per-class scores; the
            predicted class is the argmax along axis 1) and ``label_ids``
            (gold labels). This is what the Trainer hands to its
            ``compute_metrics`` hook.

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
        Precision, recall and F1 use ``average='binary'``.
    """
    gold = predictions.label_ids
    predicted = predictions.predictions.argmax(axis=1)

    # The fourth element (support) is not reported.
    prec, rec, f_score, _support = precision_recall_fscore_support(
        gold, predicted, average='binary'
    )

    return dict(
        accuracy=accuracy_score(gold, predicted),
        f1=f_score,
        precision=prec,
        recall=rec,
    )

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 15.5 MB/s 
[?25hCollecting datasets
  Downloading datasets-2.0.0-py3-none-any.whl (325 kB)
[K     |████████████████████████████████| 325 kB 74.4 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 6.2 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 71.4 MB/s 
[?25hCollecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 59.2 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 70.7

In [None]:
# Loading output dataset of question generation module
df = pd.read_csv('metric_df(beams=1).csv')

# Number of shuffled rows that go to the training set; the remainder is dev.
TRAIN_SIZE = 18000
# A split point at or beyond the end of the frame would silently yield an
# empty dev set, so fail early with a readable message instead.
assert 0 < TRAIN_SIZE < len(df), (
    f"TRAIN_SIZE={TRAIN_SIZE} must be smaller than the dataset ({len(df)} rows)"
)

# Shuffle once with a fixed seed so the split is reproducible.
sample_df = df.sample(frac=1, random_state=2)
print(sample_df['generated'].value_counts())

# Split to train dev data set.
# Each part is re-shuffled with the same seed; this is kept as-is so the row
# order written to disk (and therefore the recorded training run) is unchanged.
train_metric = sample_df[:TRAIN_SIZE]
train_metric = train_metric.sample(frac=1, random_state=2)
dev_metric = sample_df[TRAIN_SIZE:]
dev_metric = dev_metric.sample(frac=1, random_state=2)
print(train_metric['generated'].value_counts())
print(dev_metric['generated'].value_counts())

# Creating csv for the model.
# index=False: the shuffled pandas index carries no information and would
# otherwise be written (and later re-loaded) as an extra unnamed column.
train_metric.to_csv('train_metric.csv', index=False)
dev_metric.to_csv('dev_metric.csv', index=False)

1    10570
0    10570
Name: generated, dtype: int64
0    9009
1    8991
Name: generated, dtype: int64
1    1579
0    1561
Name: generated, dtype: int64


In [None]:
# Binary classifier used to check whether generated questions can be told
# apart from original (non-generated) ones.
model_name = 'distilbert-base-cased'
model_seq_classification = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Paths
train_path = 'train_metric.csv'
dev_path = 'dev_metric.csv'
# NOTE(review): test_path is not used in this cell and no cell shown here
# writes this file; kept in case another cell depends on the name.
test_path = 'test_metric.csv'

# Data files
data_files = {
    'train': str(train_path),
    'dev': str(dev_path)
}

dev_datafiles = {
    'dev': str(dev_path)
}

# Tokenization
# Every question is truncated/padded to this many tokens.
MAX_QUESTION_TOKENS = 50
tokenizer = AutoTokenizer.from_pretrained(model_name)


def _load_and_tokenize(files):
    """Load CSV splits, tokenize the 'question' column and attach labels.

    Args:
        files: mapping of split name -> CSV path, as accepted by
            ``load_dataset('csv', data_files=...)``.

    Returns:
        (raw, tokenized): the raw DatasetDict and its tokenized counterpart in
        torch format, with a 'label' column copied from the raw 'generated'
        column of each split.
    """
    raw = load_dataset('csv', data_files=files)
    tokenized = raw.map(tokenizer,
                        input_columns='question',
                        fn_kwargs={"max_length": MAX_QUESTION_TOKENS,
                                   "truncation": True,
                                   "padding": "max_length"})
    tokenized.set_format('torch')
    # The model expects its target under the name 'label'.
    for split in tokenized:
        tokenized[split] = tokenized[split].add_column('label', raw[split]['generated'])
    return raw, tokenized


datasets, tokenized_datasets = _load_and_tokenize(data_files)
print(tokenized_datasets)

# Dev-only copy of the same dev CSV; the evaluation cell below reads
# tokenized_dev_dataset['dev'].
dev_dataset, tokenized_dev_dataset = _load_and_tokenize(dev_datafiles)
print(tokenized_dev_dataset)

OUT_PATH = Path("trainDir")

# Evaluate and checkpoint once per epoch, then reload the checkpoint with the
# highest dev accuracy when training finishes.
args = TrainingArguments(output_dir=str(OUT_PATH),  # output_dir is documented as a str
                         overwrite_output_dir=True,
                         per_device_train_batch_size=64,
                         per_device_eval_batch_size=64,
                         save_strategy='epoch',
                         metric_for_best_model='eval_accuracy',
                         load_best_model_at_end=True,
                         greater_is_better=True,
                         evaluation_strategy='epoch',
                         do_train=True,
                         num_train_epochs=5,
                         report_to='none')

trainer = Trainer(
    model=model_seq_classification,
    args=args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['dev'],
    compute_metrics=metric_fn
)


loading configuration file https://huggingface.co/distilbert-base-cased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/ebe1ea24d11aa664488b8de5b21e33989008ca78f207d4e30ec6350b693f073f.302bfd1b5e031cc1b17796e0b6e5b242ba2045d31d00f97589e12b458ebff27a
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-cased",
  "activation": "gelu",
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.17.0",
  "vocab_size": 28996
}

loading weights file https://huggingface.co/distilbert-base-cased/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/9c9f39769dba4c5fe379b4bc82973eb01297bd607954621434

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-639631b026284c38/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-639631b026284c38/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Using custom data configuration default-b1c7d0f716ff3b90


Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-b1c7d0f716ff3b90/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-b1c7d0f716ff3b90/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

loading configuration file https://huggingface.co/distilbert-base-cased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/ebe1ea24d11aa664488b8de5b21e33989008ca78f207d4e30ec6350b693f073f.302bfd1b5e031cc1b17796e0b6e5b242ba2045d31d00f97589e12b458ebff27a
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-cased",
  "activation": "gelu",
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.17.0",
  "vocab_size": 28996
}

loading file https://huggingface.co/distilbert-base-cased/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/ba377304984dc63e3ede0e23a938bbbf04d5c3835b66d5bb48343aecca188429.4

  0%|          | 0/18000 [00:00<?, ?ex/s]

  0%|          | 0/3140 [00:00<?, ?ex/s]

loading configuration file https://huggingface.co/distilbert-base-cased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/ebe1ea24d11aa664488b8de5b21e33989008ca78f207d4e30ec6350b693f073f.302bfd1b5e031cc1b17796e0b6e5b242ba2045d31d00f97589e12b458ebff27a
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-cased",
  "activation": "gelu",
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.17.0",
  "vocab_size": 28996
}

loading file https://huggingface.co/distilbert-base-cased/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/ba377304984dc63e3ede0e23a938bbbf04d5c3835b66d5bb48343aecca188429.4

  0%|          | 0/3140 [00:00<?, ?ex/s]

PyTorch: setting up devices


DatasetDict({
    train: Dataset({
        features: ['question', 'input_ids', 'attention_mask', 'label'],
        num_rows: 18000
    })
    dev: Dataset({
        features: ['question', 'input_ids', 'attention_mask', 'label'],
        num_rows: 3140
    })
})
DatasetDict({
    dev: Dataset({
        features: ['question', 'input_ids', 'attention_mask', 'label'],
        num_rows: 3140
    })
})


In [None]:
# Train the model.
# Runs the fine-tuning configured in `args`: 5 epochs, with an evaluation and a
# checkpoint after every epoch. Because load_best_model_at_end=True with
# metric_for_best_model='eval_accuracy', the trainer ends up holding the
# checkpoint with the best dev accuracy rather than the last epoch's weights.
trainer.train()

The following columns in the training set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: question. If question are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 18000
  Num Epochs = 5
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 1410


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.54623,0.716242,0.741664,0.683957,0.810006
2,0.551200,0.517789,0.742675,0.773796,0.693427,0.875237
3,0.551200,0.573933,0.742675,0.769801,0.699637,0.855605
4,0.344700,0.720167,0.726115,0.761376,0.677531,0.868904
5,0.344700,0.845089,0.723885,0.753483,0.683695,0.839139


The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: question. If question are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3140
  Batch size = 64
Saving model checkpoint to trainDir/checkpoint-282
Configuration saved in trainDir/checkpoint-282/config.json
Model weights saved in trainDir/checkpoint-282/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: question. If question are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3140
  Batch size = 64
Saving model checkpoint to trainDir/checkpoint-564
Configuration saved in trainDir/checkpoint-564/config.json
Model weights s

TrainOutput(global_step=1410, training_loss=0.3768798774015819, metrics={'train_runtime': 438.7898, 'train_samples_per_second': 205.11, 'train_steps_per_second': 3.213, 'total_flos': 1164264246000000.0, 'train_loss': 0.3768798774015819, 'epoch': 5.0})

In [None]:
# Check performance on dev dataset.
# Inference is run once; element 0 of the prediction output holds the
# per-class scores. (A second, identical predict() call whose result was
# discarded has been dropped: it only repeated the full inference pass.)
dev_pred = trainer.predict(test_dataset=tokenized_dev_dataset['dev'])[0]

# Predicted class = index of the highest score.
preds = dev_pred.argmax(axis=1)

true_labels = tokenized_dev_dataset['dev']['label']
acc = accuracy_score(true_labels, preds)
print(true_labels, preds, acc)

# Cross-check with the trainer's own evaluation on the same dev rows; the
# resulting metrics dict is this cell's displayed output.
trainer.evaluate(eval_dataset=tokenized_datasets['dev'])

The following columns in the test set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: question. If question are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 3140
  Batch size = 64


The following columns in the test set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: question. If question are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 3140
  Batch size = 64


tensor([1, 0, 0,  ..., 1, 1, 0]) [0 0 0 ... 1 1 0] 0.7426751592356687


The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: question. If question are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3140
  Batch size = 64


{'epoch': 5.0,
 'eval_accuracy': 0.7426751592356687,
 'eval_f1': 0.7737961926091825,
 'eval_loss': 0.5177887678146362,
 'eval_precision': 0.6934269944806823,
 'eval_recall': 0.8752374920835972,
 'eval_runtime': 5.3744,
 'eval_samples_per_second': 584.246,
 'eval_steps_per_second': 9.303}

In [None]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

In [None]:
# Mean sentence-level BLEU (no smoothing) over consecutive row pairs.
# Rows are consumed two at a time: the even row is used as the reference and
# the following odd row as the hypothesis.
# NOTE(review): this presumes the CSV alternates original / generated
# questions - confirm against how the file was built.
questions = df['question'].tolist()
n_pairs = len(df) / 2

bleu_score = 0
for ref_idx in range(0, len(questions), 2):
    reference_tokens = questions[ref_idx].split()
    hypothesis_tokens = questions[ref_idx + 1].split()
    bleu_score += sentence_bleu([reference_tokens], hypothesis_tokens)

# Reported as a percentage.
print((bleu_score / n_pairs) * 100)

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


9.810749494412837


In [None]:
# Average question length in characters, separately for each value of the
# 'generated' flag.
def _mean_char_length(questions):
    """Return the mean character count of a Series of question strings."""
    return np.mean(questions.str.len())


mean_len_by_flag = df.groupby('generated')['question'].apply(_mean_char_length)
df_by_generated = mean_len_by_flag.reset_index(name='mean_len_question')
print(df_by_generated)

   generated  mean_len_question
0          0          60.013718
1          1          51.733207


In [None]:
# Calculating BLEU score with smoothing (NLTK SmoothingFunction method 4).
# Without smoothing, any pair with zero overlap at some n-gram order scores 0
# (see the warnings printed by the unsmoothed run). Method 4 scales its
# smoothed counts by hypothesis length, which is relevant here because the
# generated questions are shorter on average than the original ones.
# Rows are consumed in pairs: even row = reference, next row = hypothesis.
# NOTE(review): this presumes the CSV alternates original / generated
# questions - confirm against how the file was built.
if len(df) % 2 != 0:
    # Otherwise the last iteration would fail with an opaque IndexError.
    raise ValueError(f"expected an even number of rows (question pairs), got {len(df)}")

# The smoothing object is stateless across calls, so build it once rather
# than on every iteration.
smoothing = SmoothingFunction().method4

bleu_score = 0

for i in range(0, len(df), 2):

    bleu_score = bleu_score + sentence_bleu(
               [df['question'].iloc[i].split()],
               df['question'].iloc[i + 1].split(),
               smoothing_function=smoothing
    )

# Mean over pairs, reported as a percentage.
print((bleu_score / (len(df)/2)*100))

14.767482555435919
