<a href="https://colab.research.google.com/github/ElFosco/NLP_argument_creation/blob/main/Argument_similarity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Clone the repo with the models


In [None]:
! git clone https://github.com/UKPLab/acl2019-BERT-argument-classification-and-clustering

In [None]:
# Install the correct version 
! pip install pytorch-pretrained-bert==0.6.2 sklearn scipy

# Download the model trained on the complete UKP Aspects Corpus, together with the dataset

In [None]:
! wget https://public.ukp.informatik.tu-darmstadt.de/reimers/2019_acl-BERT-argument-classification-and-clustering/models/argument_similarity_ukp_aspects_all.zip
! wget https://tudatalib.ulb.tu-darmstadt.de/bitstream/handle/tudatalib/1998/UKP_ASPECT.zip?sequence=1&isAllowed=y

# Download the model trained on the complete AFS corpus from Misra et al., together with the dataset

In [None]:
# Pretrained model (zip archive); it is extracted into models/ further down.
! wget https://public.ukp.informatik.tu-darmstadt.de/reimers/2019_acl-BERT-argument-classification-and-clustering/models/argument_similarity_misra_all.zip
# Dataset release archive; it is extracted into datasets/ further down.
! wget http://nldslab.soe.ucsc.edu/afs16/Sigdial_16_release_data.zip

In [None]:
# Disabled: run the inference script that ships with the cloned repository.
#! python /content/acl2019-BERT-argument-classification-and-clustering/argument-similarity/inference.py

In [None]:
import os

# Make the repository's argument-similarity folder the working directory.
# The relative paths used afterwards (models/, datasets/, train_*.sh) are
# resolved against it; presumably the local `train` / `SigmoidBERT` imports
# rely on it as well -- TODO confirm.
os.chdir(
    os.path.join(
        '/content',
        'acl2019-BERT-argument-classification-and-clustering',
        'argument-similarity',
    )
)

In [None]:
! mkdir models
#Unzip the models
! unzip /content/argument_similarity_ukp_aspects_all.zip -d models
! unzip /content/argument_similarity_misra_all.zip -d models
#Unzip the datasets
! unzip /content/UKP_ASPECT.zip?sequence=1 -d datasets
! unzip /content/Sigdial_16_release_data.zip -d datasets

## Training UKP Aspects Corpus

In [None]:
# We can choose whether to train on the entire dataset or on a subset
# (train/dev/test); testing on other datasets is also possible.
! sh train_ukp.sh
# After this, the output will be in the bert_output directory.

## Training Argument Facet Similarity (AFS) Corpus

In [None]:
# Run the AFS training script, train_misra.sh, from the current working directory.
! sh train_misra.sh

In [None]:
# Inference settings.
# Choose the model to use: model_path is the directory handed to
# BertTokenizer.from_pretrained and SigmoidBERT.from_pretrained.
model_path = 'models/misra_all' #@param {type:"string"}


# Sequence-length limit passed to convert_examples_to_features.
max_seq_length = 64
# Batch size of the evaluation DataLoader.
eval_batch_size = 8

# Example: applying the model trained on the entire dataset

In [17]:
from itertools import combinations

from pytorch_pretrained_bert.tokenization import BertTokenizer
import torch
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
from train import InputExample, convert_examples_to_features
from SigmoidBERT import SigmoidBERT

# model_path, max_seq_length and eval_batch_size come from the configuration
# cell above. They are deliberately not reassigned here: doing so silently
# overrode whatever model had been chosen in that cell's form field.

arguments = ['Cannabis should be legal because is not harmful substance',
             'Weed is a substance useful for the treatment of cancer.',
             'Zoos are detrimental to animals\' physical health.',
             'Zoo confinement is psychologically damaging to animals.',
             'Eating meat is not cruel or unethical; it is a natural part of the cycle of life. ',
             'It is cruel and unethical to kill animals for food when vegetarian options are available',
             'Overwhelming scientific consensus says human activity is primarily responsible for global climate change.',
             'Rising levels of human-produced gases released into the atmosphere create a greenhouse effect that traps heat and causes global warming.'
             ]

# Compare every argument with every other one: each unordered pair appears
# exactly once, in the same (i < j) order a nested index loop would produce.
argument_pairs = list(combinations(arguments, 2))

# label=-1 looks like a dummy value (nothing in this cell reads it) -- TODO
# confirm against train.py.
input_examples = [
    InputExample(text_a=first, text_b=second, label=-1)
    for first, second in argument_pairs
]

# Turn the sentence pairs into fixed-length BERT input features.
tokenizer = BertTokenizer.from_pretrained(model_path, do_lower_case=True)
eval_features = convert_examples_to_features(input_examples, max_seq_length, tokenizer)

all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)

# SequentialSampler keeps the batches in pair order, which is what allows the
# predictions to be matched back to argument_pairs by position afterwards.
eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids)
eval_sampler = SequentialSampler(eval_data)
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=eval_batch_size)

# Load the fine-tuned model on the GPU when one is available and switch it to
# evaluation mode.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SigmoidBERT.from_pretrained(model_path)
model.to(device)
model.eval()

# One score per pair: column 0 of the model output for every batch.
predicted_logits = []
with torch.no_grad():
    for batch in eval_dataloader:
        input_ids, input_mask, segment_ids = (tensor.to(device) for tensor in batch)

        # Note the argument order: segment ids come before the attention mask.
        logits = model(input_ids, segment_ids, input_mask)
        logits = logits.detach().cpu().numpy()
        predicted_logits.extend(logits[:, 0])

# Attach each score to its sentence pair and sort by similarity, highest first.
output_examples = sorted(
    ([first, second, score]
     for (first, second), score in zip(argument_pairs, predicted_logits)),
    key=lambda row: row[2],
    reverse=True,
)

print("Predicted similarities (sorted by similarity):")
for sentence_a, sentence_b, similarity in output_examples:
    print("Sentence A:", sentence_a)
    print("Sentence B:", sentence_b)
    print("Similarity:", similarity)
    print("")

loading vocabulary file models/misra_all/vocab.txt
:: Sentences longer than max_sequence_length: 0
:: Num sentences: 28
loading archive file models/misra_all
Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 30522
}



Predicted similarities (sorted by similarity):
Sentence A: Zoos are detrimental to animals' physical health.
Sentence B: Zoo confinement is psychologically damaging to animals.
Similarity: 0.8723387

Sentence A: Eating meat is not cruel or unethical; it is a natural part of the cycle of life. 
Sentence B: It is cruel and unethical to kill animals for food when vegetarian options are available
Similarity: 0.77635074

Sentence A: Overwhelming scientific consensus says human activity is primarily responsible for global climate change.
Sentence B: Rising levels of human-produced gases released into the atmosphere create a greenhouse effect that traps heat and causes global warming.
Similarity: 0.67724043

Sentence A: Cannabis should be legal because is not harmful substance
Sentence B: Weed is a substance useful for the treatment of cancer.
Similarity: 0.59671324

Sentence A: Zoos are detrimental to animals' physical health.
Sentence B: It is cruel and unethical to kill animals for food wh