<a href="https://colab.research.google.com/github/ElFosco/NLP_score/blob/Fosco/Predictor_scores.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
import os
import shutil

import re
import numpy as np

import pandas as pd

import seaborn as sns
from matplotlib import pyplot as plt


# Data Exploration

In [None]:
# Mount Google Drive so the dataset stored there can be read from Colab.
from google.colab import drive

drive.mount('/content/drive')

# File name of the dataset and the Drive folder that contains it.
# Alternative folder layout, kept for reference:
# dir_path = "drive/MyDrive/NLP_project/Datasets/"
dataset = "arg_quality_rank_30k.csv"
dir_path = "drive/MyDrive/Magistrale/NLP/Project/Data/"


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the dataset from Drive and preview its first rows.
df = pd.read_csv(f"{dir_path}{dataset}")
df.head()

Unnamed: 0,argument,topic,set,WA,MACE-P,stance_WA,stance_WA_conf
0,"""marriage"" isn't keeping up with the times. a...",We should abandon marriage,train,0.846165,0.297659,1,1.0
1,.a multi-party system would be too confusing a...,We should adopt a multi-party system,train,0.891271,0.726133,-1,1.0
2,\ero-tolerance policy in schools should not be...,We should adopt a zero-tolerance policy in sch...,dev,0.721192,0.396953,-1,1.0
3,`people reach their limit when it comes to the...,Assisted suicide should be a criminal offence,train,0.730395,0.225212,-1,1.0
4,"100% agree, should they do that, it would be a...",We should abolish safe spaces,train,0.236686,0.004104,1,0.805517


In [None]:
# Mapping topic -> number of arguments (filled in by the next cell),
# and the distinct topics present in the dataset.
dict_topic = dict()
set_topic = df['topic'].unique()

In [None]:
# Count how many arguments belong to each topic, then list the topics
# from the most to the least represented.
for topic_name in set_topic:
    belongs_to_topic = df['topic'] == topic_name
    dict_topic[topic_name] = int(belongs_to_topic.sum())
sorted(dict_topic.items(), key=lambda pair: pair[1], reverse=True)

[('We should fight for the abolition of nuclear weapons', 554),
 ('We should legalize cannabis', 548),
 ('We should ban naturopathy', 540),
 ('Foster care brings more harm than good', 538),
 ('Blockade of the Gaza Strip should be ended', 521),
 ('We should legalize prostitution', 504),
 ('We should ban cosmetic surgery for minors', 502),
 ('We should legalize polygamy', 500),
 ('We should abolish the three-strikes laws', 499),
 ('We should end mandatory retirement', 484),
 ('We should abandon the use of school uniform', 480),
 ('Intelligence tests bring more harm than good', 472),
 ('We should abolish capital punishment', 470),
 ('Holocaust denial should be a criminal offence', 466),
 ('We should adopt a zero-tolerance policy in schools', 459),
 ('We should end affirmative action', 456),
 ('We should oppose collectivism', 454),
 ('We should close Guantanamo Bay detention camp', 447),
 ('Payday loans should be banned', 446),
 ('We should stop the development of autonomous cars', 446),
 

# Data Preprocessing

In [None]:
# A well-formed argument is a single line whose first character is an ASCII
# letter, a digit, an underscore or a space.
start_sentence = "^[a-zA-Z0-9_ ].*$"
pattern_sentence = re.compile(pattern=start_sentence)



In [None]:
# Print the index and text of every argument that does not match the
# expected sentence pattern, each followed by a blank separator.
for index, argument in df['argument'].items():
    if pattern_sentence.match(argument) is not None:
        continue
    print(index)
    print(argument)
    print('\n')

0
"marriage" isn't keeping up with the times.  abandon the old thinking and bring something that incorporates all unions - not just those with a man and woman.


1
.a multi-party system would be too confusing and getting a consensus from the general public would be difficult.


2
\ero-tolerance policy in schools should not be adopted as circumstances are often not black and white, being more nuanced. no one should be written off due to a mistake of judgement.


3
`people reach their limit when it comes to their quality of life and should be able to end their suffering.  this can be done with little or no suffering  by assistance and the person is able to say good bye.


337
a school has students of all types of religions and beliefs.  these beliefs encompass different entities to pray to.
banning school prayer allows all students to worship as they choose.


529
abuse and bullying are rife among foster children
it does not provide stability just temporary solutions


707
affirmative ac

## Data Split

In [None]:
# Boolean masks selecting each partition through the dataset's 'set' column.
is_training_data = df['set'] == 'train'
is_validation_data = df['set'] == 'dev'
is_test_data = df['set'] == 'test'

# Only the argument text and its MACE-P score are kept; the index of each
# partition is renumbered from zero.
kept_columns = ['argument', 'MACE-P']
training_data = df.loc[is_training_data, kept_columns].reset_index(drop=True)
validation_data = df.loc[is_validation_data, kept_columns].reset_index(drop=True)
test_data = df.loc[is_test_data, kept_columns].reset_index(drop=True)

# [BERT](https://colab.research.google.com/github/tensorflow/text/blob/master/docs/tutorials/classify_text_with_bert.ipynb)



In [None]:
pip install transformers



In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from transformers import BertPreTrainedModel, BertModel
from transformers import AutoConfig, AutoTokenizer

from sklearn import metrics
from sklearn.model_selection import train_test_split
from tqdm import tqdm, trange

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
def _count_words(text):
    """Return the number of whitespace-separated words in ``text``."""
    return len(text.split())


# Largest word count among the training arguments.
# TODO: consider picking the length as done in the course exercise instead.
max_word = training_data['argument'].map(_count_words).max()

In [None]:
# Directory reserved for model artefacts.
MODEL_OUT_DIR = '/kaggle/working/models/bert_regressor'
## Model Configurations
# Every split uses the same sequence length, derived from the longest
# training argument (counted in words, see max_word).
MAX_LEN_TRAIN = max_word
MAX_LEN_VALID = max_word
MAX_LEN_TEST = max_word
BATCH_SIZE = 64
# All BERT weights are optimised together with the regression head, so the
# learning rate has to stay in the small range customary for fine-tuning
# (2e-5 .. 5e-5). The previous value of 1e-3 is large enough to wreck the
# pretrained encoder during the first updates.
LR = 2e-5
NUM_EPOCHS = 10
NUM_THREADS = 1  ## Number of threads for collecting dataset
MODEL_NAME = 'bert-base-uncased'

# exist_ok avoids the separate isdir check and the race between the check
# and the creation.
os.makedirs(MODEL_OUT_DIR, exist_ok=True)

In [None]:
class Excerpt_Dataset(Dataset):
    """Map-style dataset yielding fixed-length encoded arguments and scores.

    Each item is a tuple ``(input_ids, attention_mask, mace_p)``:

    * ``input_ids``: 1-D integer tensor of length ``maxlen``;
    * ``attention_mask``: 1-D long tensor of length ``maxlen`` holding 1 for
      real tokens (special tokens included) and 0 for padding;
    * ``mace_p``: scalar float32 tensor with the regression target.

    Args:
        data: DataFrame with the columns ``'argument'`` and ``'MACE-P'``.
        maxlen: Length every encoded sequence is padded or truncated to.
        tokenizer: Object exposing ``tokenize``, ``convert_tokens_to_ids``
            and the attributes ``cls_token``, ``sep_token`` and ``pad_token``.
    """

    def __init__(self, data, maxlen, tokenizer):
        # reset_index gives a 0..n-1 index, which __getitem__ relies on.
        self.df = data.reset_index()
        self.tokenizer = tokenizer
        # maxlen may arrive as a NumPy integer (e.g. the result of .max());
        # a plain int keeps slicing and list repetition unambiguous.
        self.maxlen = int(maxlen)

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return self.df.shape[0]

    def __getitem__(self, index):
        """Encode the row at ``index`` and return its tensors."""
        argument = self.df.loc[index, 'argument']
        mace_p = self.df.loc[index, 'MACE-P']

        tok = self.tokenizer
        # Special tokens are taken from the tokenizer rather than hard-coded,
        # so they always agree with its vocabulary.
        tokens = [tok.cls_token] + tok.tokenize(argument) + [tok.sep_token]
        if len(tokens) > self.maxlen:
            # Truncate while keeping the closing separator as last token.
            tokens = tokens[:self.maxlen - 1] + [tok.sep_token]
        n_real = len(tokens)
        tokens = tokens + [tok.pad_token] * (self.maxlen - n_real)

        input_ids = torch.tensor(tok.convert_tokens_to_ids(tokens))

        # The mask is built from the number of real tokens instead of from
        # "id != 0": that comparison is only right when the padding id is 0
        # and no real token shares that id.
        attention_mask = torch.zeros(self.maxlen, dtype=torch.long)
        attention_mask[:n_real] = 1

        mace_p = torch.tensor(mace_p, dtype=torch.float32)

        return input_ids, attention_mask, mace_p

In [None]:
class BertRegresser(BertPreTrainedModel):
    """BERT encoder followed by a small feed-forward regression head.

    The head maps the hidden state at sequence position 0 through
    Linear(hidden_size, 128) -> ReLU -> Linear(128, 128) -> Tanh ->
    Linear(128, 1).
    """

    def __init__(self, config):
        super().__init__(config)
        # Encoder
        self.bert = BertModel(config)
        # Regression head applied to the first-position representation
        self.cls_layer1 = nn.Linear(config.hidden_size, 128)
        self.relu1 = nn.ReLU()
        self.ff1 = nn.Linear(128, 128)
        self.tanh1 = nn.Tanh()
        self.ff2 = nn.Linear(128, 1)

    def forward(self, input_ids, attention_mask):
        """Return the head's output for each sequence in the batch.

        The last dimension of the result has size 1 (output of ``ff2``).
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Hidden state of the first token of every sequence
        first_token_state = encoded.last_hidden_state[:, 0, :]
        hidden = self.relu1(self.cls_layer1(first_token_state))
        hidden = self.tanh1(self.ff1(hidden))
        return self.ff2(hidden)

In [None]:
def train(model, criterion, optimizer, train_loader, val_loader, epochs, device):
    """Train ``model`` for ``epochs`` epochs, validating after each one.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        criterion: Loss callable taking ``(prediction, target)``.
        optimizer: Optimizer stepping the model parameters.
        train_loader: Iterable of ``(input_ids, attention_mask, target)``
            batches used for the updates.
        val_loader: Iterable of batches passed to ``evaluate`` after every
            epoch.
        epochs: Number of passes over ``train_loader``.
        device: Device the batches are moved to.

    Prints the mean training loss and the validation loss of every epoch.
    """
    for epoch in trange(epochs, desc="Epoch"):
        print("Chiamo model.train")
        model.train()
        train_loss = 0
        for input_ids, attention_mask, target in train_loader:
            optimizer.zero_grad()

            input_ids, attention_mask, target = input_ids.to(device), attention_mask.to(device), target.to(device)

            output = model(input_ids=input_ids, attention_mask=attention_mask)

            # Flatten prediction and target to the same 1-D shape. A (B, 1)
            # prediction against a (B,) target would be broadcast to (B, B)
            # by the loss, comparing every prediction with every target and
            # pushing the model towards a single constant output.
            output = output.reshape(-1)
            target = target.reshape(-1).type_as(output)

            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        print(f"Training loss is {train_loss/len(train_loader)}")
        val_loss = evaluate(model=model, criterion=criterion, dataloader=val_loader, device=device)
        print("Epoch {} complete! Validation Loss : {}".format(epoch, val_loss))

In [None]:
def evaluate(model, criterion, dataloader, device):
    """Return the mean per-batch loss of ``model`` over ``dataloader``.

    The model is switched to evaluation mode and gradients are disabled.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        criterion: Loss callable taking ``(prediction, target)``.
        dataloader: Iterable of ``(input_ids, attention_mask, target)``
            batches; it must yield at least one batch.
        device: Device the batches are moved to.

    Returns:
        The sum of the batch losses divided by the number of batches.
    """
    model.eval()
    total_loss, count = 0.0, 0

    with torch.no_grad():
        for input_ids, attention_mask, target in dataloader:

            input_ids, attention_mask, target = input_ids.to(device), attention_mask.to(device), target.to(device)
            output = model(input_ids, attention_mask)

            # Flatten both sides so a (B, 1) prediction is compared
            # element-wise with its (B,) target instead of being broadcast
            # to a (B, B) grid by the loss.
            output = output.reshape(-1)
            target = target.reshape(-1).type_as(output)

            total_loss += criterion(output, target).item()
            count += 1

    return total_loss / count

In [None]:
def predict(model, dataloader, device):
    """Run ``model`` over ``dataloader`` and collect one prediction per sample.

    Inference runs in evaluation mode with gradients disabled; the
    train/eval mode the model had on entry is restored before returning.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        dataloader: Iterable of ``(input_ids, attention_mask, target)``
            batches; the targets are not used.
        device: Device the inputs are moved to.

    Returns:
        A list with one CPU tensor per sample, in dataloader order (each
        element is one row of the model output).
    """
    predicted_label = []
    was_training = model.training
    # Without eval() layers such as dropout would stay active whenever the
    # caller hands over a model that is still in training mode.
    model.eval()
    try:
        with torch.no_grad():
            for input_ids, attention_mask, _target in dataloader:
                input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
                output = model(input_ids, attention_mask)
                # Move results off the accelerator right away so the list
                # does not pin device memory; extend() adds one row per sample.
                predicted_label.extend(output.detach().cpu())
    finally:
        model.train(was_training)

    return predicted_label

In [None]:
## Configuration loaded from AutoConfig 
# Use the first GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Configuration and tokenizer of the pretrained checkpoint.
config = AutoConfig.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Regressor initialised from the checkpoint, placed on the chosen device.
model = BertRegresser.from_pretrained(MODEL_NAME, config=config)
model = model.to(device)

# Mean squared error between predicted and reference scores.
criterion = nn.MSELoss()

# Adam over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=LR)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertRegresser: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertRegresser from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertRegresser from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertRegresser were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['ff2.weight', 'cl

In [None]:
## Training Dataset
## Datasets: one per split, each with its own maximum length
train_set, valid_set, test_set = (
    Excerpt_Dataset(data=frame, maxlen=length, tokenizer=tokenizer)
    for frame, length in (
        (training_data, MAX_LEN_TRAIN),
        (validation_data, MAX_LEN_VALID),
        (test_data, MAX_LEN_TEST),
    )
)

## Data loaders (the training loader uses its own batch size of 300)
train_loader = DataLoader(train_set, batch_size=300)
valid_loader = DataLoader(valid_set, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_set, batch_size=BATCH_SIZE)

In [None]:
# Fine-tune for 5 epochs, validating on the dev split after each epoch.
train(model, criterion, optimizer, train_loader, valid_loader, 5, device)

  return F.mse_loss(input, target, reduction=self.reduction)


Chiamo model.train


  return F.mse_loss(input, target, reduction=self.reduction)


Training loss is 0.15501523028526987


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
Epoch:  20%|██        | 1/5 [02:21<09:25, 141.26s/it]

Epoch 0 complete! Validation Loss : 0.14259063525527133
Chiamo model.train
Training loss is 0.14257483801671436


Epoch:  40%|████      | 2/5 [04:50<07:18, 146.14s/it]

Epoch 1 complete! Validation Loss : 0.1346602607883659
Chiamo model.train
Training loss is 0.137864502625806


Epoch:  60%|██████    | 3/5 [07:23<04:58, 149.03s/it]

Epoch 2 complete! Validation Loss : 0.13456060562063665
Chiamo model.train
Training loss is 0.13797956630587577


Epoch:  80%|████████  | 4/5 [09:57<02:30, 150.96s/it]

Epoch 3 complete! Validation Loss : 0.13459586483590744
Chiamo model.train
Training loss is 0.13781506291457585


Epoch: 100%|██████████| 5/5 [12:31<00:00, 150.24s/it]

Epoch 4 complete! Validation Loss : 0.13456775759365044





In [None]:
# Predictions for the training split, in loader order.
output = predict(model=model, dataloader=train_loader, device=device)

In [None]:
# Convert every per-sample prediction tensor into a NumPy array on the host.
out2 = list()
for out in output:
    host_array = out.detach().cpu().numpy()
    out2.append(host_array)
print(out2)

[array([0.58678365], dtype=float32), array([0.58678365], dtype=float32), array([0.58678365], dtype=float32), array([0.58678365], dtype=float32), array([0.58678365], dtype=float32), array([0.58678365], dtype=float32), array([0.58678365], dtype=float32), array([0.58678365], dtype=float32), array([0.58678365], dtype=float32), array([0.58678365], dtype=float32), array([0.58678365], dtype=float32), array([0.58678365], dtype=float32), array([0.58678365], dtype=float32), array([0.58678365], dtype=float32), array([0.58678365], dtype=float32), array([0.58678365], dtype=float32), array([0.58678365], dtype=float32), array([0.58678365], dtype=float32), array([0.58678365], dtype=float32), array([0.58678365], dtype=float32), array([0.58678365], dtype=float32), array([0.58678365], dtype=float32), array([0.58678365], dtype=float32), array([0.58678365], dtype=float32), array([0.58678365], dtype=float32), array([0.58678365], dtype=float32), array([0.58678365], dtype=float32), array([0.58678365], dtype=f

In [None]:
# Build the comparison table from ALL predictions. The previous version used
# `out`, the loop variable left over from the conversion cell (a single
# tensor), whose length does not match the other columns.
# out2 holds one array per sample; flatten it into a 1-D float vector.
predicted_mace_p = np.asarray(out2, dtype=np.float32).reshape(-1)
submission = pd.DataFrame({
    'argument': training_data['argument'],
    'mace_p_predicted': predicted_mace_p,
    'real_mace_p': training_data['MACE-P'],
})

ValueError: ignored