#Assignment 4
Orazi Filippo,
Rossolini Andrea 

In [None]:
# data and numerical management packages
import pandas as pd
import numpy as np

# useful during debugging (progress bars)
from tqdm import tqdm

# Keras packages
from keras import Sequential 
from keras.layers import Embedding, SimpleRNN, TimeDistributed, Dense, Bidirectional, Masking, LSTM, GRU, Input, Concatenate,Flatten, Add, Average, Dot, GlobalAveragePooling1D
from keras import backend as K
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras import Model 
from keras.utils import plot_model
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.preprocessing import OneHotEncoder

# utils
from urllib import request
import zipfile
import gensim
import gensim.downloader as gloader
import scipy.sparse
import tensorflow as tf
import matplotlib.pyplot as plt
import os
import requests
import zipfile


In [None]:
def save_response_content(response, destination):
    """
    Write the body of `response` to the file `destination`.

    The body is consumed through `response.iter_content` in pieces of
    32768 bytes and written in binary mode; empty pieces are skipped.
    """
    chunk_size = 32 * 1024

    with open(destination, "wb") as out_file:
        for piece in response.iter_content(chunk_size):
            if not piece:
                # keep-alive chunks carry no payload
                continue
            out_file.write(piece)

def download_data(data_path):
    """
    Download the FEVER data splits archive into `data_path` and extract it.

    The directory is created when missing. Nothing is downloaded or
    extracted when `fever_data.zip` is already present in `data_path`.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    toy_data_path = os.path.join(data_path, 'fever_data.zip')
    toy_data_url_id = "1wArZhF9_SHW17WKNGeLmX-QTYw9Zscl1"
    toy_url = "https://docs.google.com/uc?export=download"

    os.makedirs(data_path, exist_ok=True)

    if not os.path.exists(toy_data_path):
        print("Downloading FEVER data splits...")
        with requests.Session() as current_session:
            response = current_session.get(toy_url,
                                   params={'id': toy_data_url_id},
                                   stream=True)
            # Fail loudly instead of saving an error page as the archive.
            response.raise_for_status()
            # The body is streamed, so it must be consumed while the
            # session (and its connection) is still open.
            save_response_content(response, toy_data_path)
        print("Download completed!")

        print("Extracting dataset...")
        with zipfile.ZipFile(toy_data_path) as loaded_zip:
            loaded_zip.extractall(data_path)
        print("Extraction completed!")

# Fetch the data into ./dataset (skipped when fever_data.zip is already there).
download_data('dataset')

In [None]:
# Load the training, test and validation splits, then separate the label
# column from the remaining features of each split.
path = "./dataset/"

glob_vocabulary = set()
glob_tags = set()
bohs = set()

train_set = pd.read_csv(path + "train_pairs.csv")
test_set = pd.read_csv(path + "test_pairs.csv")
val_set = pd.read_csv(path + "val_pairs.csv")

# All three files share the same five columns.
train_set.columns = ["id", "claim", "evidence", "id_claim", "label"]
test_set.columns = ["id", "claim", "evidence", "id_claim", "label"]
val_set.columns = ["id", "claim", "evidence", "id_claim", "label"]

# Features are everything except the label and the row id.
X_train = train_set.drop(columns=["label", "id"])
Y_train = train_set["label"]

X_val = val_set.drop(columns=["label", "id"])
Y_val = val_set["label"]

X_test = test_set.drop(columns=["label", "id"])
Y_test = test_set["label"]



#Preprocessing 


In [None]:
import re
from functools import reduce
import nltk
from nltk.corpus import stopwords

# Config

# Characters that are replaced by a space. The hyphen is escaped so that it is
# a literal: left bare between ';' and '`' it forms a character range that
# covers the upper-case letters and leaves the hyphen itself unmatched.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]|@,;\-`\'"–]')
# Everything that is not a digit, a lower-case letter, a period, a space or a
# tab is dropped.
GOOD_SYMBOLS_RE = re.compile('[^0-9a-z. \t]')
# Tokens removed by remove_stopwords; presumably the lower-cased remains of
# bracket markers such as -LRB-/-RRB- -- TODO confirm against the dataset.
STOPWORDS = {"rrb", "lrb", "rsb", "lsb", "lcb", "rcb"}

def remove_number(text):
    """
    Delete every run of digits that is directly followed by a tab, together
    with that tab (e.g. the numeric prefix in front of the text).
    """
    number_and_tab = re.compile(r'[0-9]+?\t')
    return number_and_tab.sub('', text)

def remove_hyperlinks(text) :
    """
    Remove the tab-separated hyperlink section that follows the sentence.

    Everything from the first tab (and the period right before it, if any)
    to the end of the text is replaced by a single period. The period is
    matched literally, so a sentence that does not end with one no longer
    loses its last character.
    """
    pattern = r'\.?\t.*?$'
    return re.sub(pattern, '.', text)

def remove_pronunciations(text) :
    """
    Remove the -LSB- ... -RSB- spans used to indicate a pronunciation,
    together with any " ;" separators that directly follow them.
    """
    # The trailing group is greedy: a lazy quantifier at the very end of a
    # pattern always matches zero repetitions and would never consume " ;".
    pattern = r'-LSB-.*?-RSB-(?:\s;)*'
    return re.sub(pattern, '', text)

def split_periods(text):
    """
    Insert a space in front of a period that ends a run of characters
    starting with a whitespace character.
    """
    return re.sub(r'(\s.+?)\.', lambda found: found.group(1) + ' .', text)

def remove_comma_thousands(text):
    """
    Drop the commas attached to numbers: first those between two groups of
    digits, then any comma that directly follows a digit.
    """
    rewrites = (
        (r'([0-9]{1,3}),([0-9]{1,3})', r'\1\2'),
        (r'([0-9]{1,3}),', r'\1'),
    )
    for pattern, replacement in rewrites:
        text = re.sub(pattern, replacement, text)
    return text

def fix_date_merged(text):
    """
    Separate numbers (dates, days) from a word glued right after them by
    inserting a space between the last digit and the first letter.
    """
    digits_then_letter = re.compile(r'([0-9]{1,4})([a-zA-Z]+?)')

    def _spaced(found):
        return found.group(1) + ' ' + found.group(2)

    return digits_then_letter.sub(_spaced, text)

def remove_repeated_ending_periods(text) :
    """
    Fix a doubled period at the end of the text.

    When the text ends with a one- or two-letter abbreviation followed by a
    second period (e.g. "U.S.."), the abbreviation is kept and the final
    period is separated from it by a space. Any other doubled final period
    is collapsed to one.
    """
    pattern = r'([a-zA-Z]{1,2}\.)\.$'
    # The replacement must be a raw string: in a plain literal '\1' is the
    # control character chr(1), not a reference to the captured group.
    text = re.sub(pattern, r'\1 .', text)
    pattern = r'\.\.$'
    return re.sub(pattern, '.', text)

def lower(text):
    """
    Return `text` converted to lower case.
    Example:
    Input: 'I really like New York city'
    Output: 'i really like new york city'
    """
    lowered = text.lower()
    return lowered

def replace_special_characters(text):
    """
    Substitute a space for every character matched by REPLACE_BY_SPACE_RE
    (special characters such as parentheses).
    """
    return re.sub(REPLACE_BY_SPACE_RE, ' ', text)

def filter_out_uncommon_symbols(text):
    """
    Delete every character matched by GOOD_SYMBOLS_RE, i.e. every
    character that is not in the good symbols list.
    """
    return re.sub(GOOD_SYMBOLS_RE, '', text)

def remove_stopwords(text):
    """
    Drop the whitespace-separated tokens listed in STOPWORDS and join the
    remaining tokens with single spaces.
    """
    kept = []
    for token in text.split():
        if token and token not in STOPWORDS:
            kept.append(token)
    return ' '.join(kept)

def strip_text(text):
    """
    Trim leading and trailing whitespace (carriage returns included).
    Example:
    Input: '  This assignment is cool\n'
    Output: 'This assignment is cool'
    """
    trimmed = text.strip()
    return trimmed

# Default cleaning steps, applied top to bottom by text_prepare.
PREPROCESSING_PIPELINE = [
    # steps that rely on the raw layout (tabs, -LSB-/-RSB- markers, case)
    remove_number,
    remove_hyperlinks,
    remove_pronunciations,
    split_periods,
    remove_comma_thousands,
    fix_date_merged,
    remove_repeated_ending_periods,
    # normalisation of case and symbols
    lower,
    replace_special_characters,
    filter_out_uncommon_symbols,
    # final token clean-up
    remove_stopwords,
    strip_text,
]

# Anchor method

def text_prepare(text, filter_methods=None):
    """
    Run `text` through a sequence of pre-processing functions, feeding the
    output of each one to the next (so the order matters).

    `filter_methods` defaults to PREPROCESSING_PIPELINE when it is None.
    """
    if filter_methods is None:
        filter_methods = PREPROCESSING_PIPELINE

    processed = text
    for step in filter_methods:
        processed = step(processed)
    return processed

# Pre-processing: clean the 'evidence' and 'claim' columns of every split.

print('Pre-processing text...')

for frame, start_message, done_message in (
    (train_set, "Preprocessing training_set", "Training_set done"),
    (val_set, "Preprocessing val_set", "Val_SET done"),
    (test_set, "Preprocessing test_set", "Test_set done"),
):
    print(start_message)
    for column in ('evidence', 'claim'):
        frame[column] = frame[column].apply(text_prepare)
    print(done_message)

print("Pre-processing completed!")
train_set.head()

Pre-processing text...
Preprocessing training_set
Training_set done
Preprocessing val_set
Val_SET done
Preprocessing test_set
Test_set done
Pre-processing completed!


Unnamed: 0,id,claim,evidence,id_claim,label
0,0,chris hemsworth appeared in a perfect getaway .,hemsworth has also appeared in the science fic...,3,SUPPORTS
1,1,roald dahl is a writer .,roald dahl 13 september 1916 23 november 1990 ...,7,SUPPORTS
2,2,roald dahl is a governor .,roald dahl 13 september 1916 23 november 1990 ...,8,REFUTES
3,3,ireland has relatively lowlying mountains .,the island s geography comprises relatively lo...,9,SUPPORTS
4,4,ireland does not have relatively lowlying moun...,the island s geography comprises relatively lo...,10,REFUTES


#Tokenization

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
import gensim.downloader as api
from nltk.tokenize.treebank import TreebankWordTokenizer
from nltk.tokenize.toktok import ToktokTokenizer

# Dimension of the pre-trained word vectors; also selects which GloVe model is loaded below.
EMB_DIM = 50
# NOTE(review): duplicates EMB_DIM and is not referenced in the code visible here.
EMBEDDING_SIZE = 50
# Load the pre-trained GloVe vectors through the gensim downloader.
embedding_model = api.load(f"glove-wiki-gigaword-{EMB_DIM}")



In [None]:
def tokenizer(data_set):
  """
  Return a new frame with the 'claim' and 'evidence' columns of `data_set`
  split into tokens by ToktokTokenizer and the 'label' column copied over.
  """
  toke = pd.DataFrame(columns=["claim", "evidence", "label"])
  for text_column in ("claim", "evidence"):
    toke[text_column] = data_set[text_column].apply(ToktokTokenizer().tokenize)
  toke["label"] = data_set["label"]
  return toke

# Tokenize the three splits (training, validation, test) in that order.
train_toke, val_toke, test_toke = (
    tokenizer(split_frame) for split_frame in (train_set, val_set, test_set)
)

##Building vocabulary

In [None]:
# Start the vocabulary from every word known to the pre-trained embeddings.
# gensim >= 4.0 exposes the words through `key_to_index`; the `vocab`
# attribute only exists in older releases, so it is used as a fallback.
voc = set(getattr(embedding_model, "key_to_index", None) or embedding_model.vocab)

def build_vocab(df) :
  """
  Collect the distinct tokens of the 'claim' and 'evidence' columns of `df`.

  Each cell of those columns is expected to hold a list of tokens. The
  result is a NumPy array with the tokens in order of first appearance.
  """
  tokens = []
  for col in ("claim", "evidence") :
    for row_tokens in df[col] :
      tokens.extend(row_tokens)
  # pd.unique is given an ndarray: passing a plain list is deprecated in
  # recent pandas releases.
  return pd.unique(np.asarray(tokens, dtype=object))

# Add the tokens of the three splits to the vocabulary.
train_voc, val_voc, test_voc = (
    np.array(build_vocab(token_frame), dtype=str)
    for token_frame in (train_toke, val_toke, test_toke)
)
voc.update(train_voc, val_voc, test_voc)

vocab_len = len(voc)
# Word indices start at 1, so index 0 is never assigned to a word.
word_to_index = {word: position for position, word in enumerate(voc, start=1)}

##Encoding

In [None]:
def encode(toke, word_to_idx):
  """
  Turn a tokenized frame into its numeric form.

  The token lists in 'claim' and 'evidence' are mapped to lists of indices
  through `word_to_idx`; 'label' becomes 1 for "SUPPORTS" and 0 otherwise.
  """
  def to_indices(tokens):
    return [word_to_idx[token] for token in tokens]

  def to_binary(label):
    if label == "SUPPORTS":
      return 1
    return 0

  encoded = pd.DataFrame(columns=["claim", "evidence", "label"])
  for text_column in ("claim", "evidence"):
    encoded[text_column] = toke[text_column].apply(to_indices)
  encoded["label"] = toke["label"].apply(to_binary)
  return encoded

# Encode the three tokenized splits with the shared word index.
train_encoded, val_encoded, test_encoded = (
    encode(token_frame, word_to_index)
    for token_frame in (train_toke, val_toke, test_toke)
)

In [None]:
# Display the encoded claim and evidence columns of the training split.
train_encoded["claim"], train_encoded["evidence"]

(0         [304403, 172010, 110904, 310387, 26881, 211718...
 1              [73008, 75225, 283905, 26881, 136216, 76359]
 2              [73008, 75225, 283905, 26881, 317431, 76359]
 3             [403641, 355055, 296257, 394446, 3386, 76359]
 4         [403641, 94455, 282144, 151502, 296257, 394446...
                                 ...                        
 121735    [141014, 158490, 334874, 7879, 21017, 78309, 1...
 121736    [21017, 78309, 283905, 231567, 171025, 206507,...
 121737    [21017, 78309, 283905, 394042, 309007, 192518,...
 121738    [21017, 78309, 158490, 342203, 211434, 334874,...
 121739    [21017, 78309, 158490, 342203, 211434, 334874,...
 Name: claim, Length: 121740, dtype: object,
 0         [172010, 355055, 380225, 110904, 310387, 33487...
 1         [73008, 75225, 352357, 112469, 14105, 389836, ...
 2         [73008, 75225, 352357, 112469, 14105, 389836, ...
 3         [334874, 269566, 52496, 176056, 275589, 296257...
 4         [334874, 269566, 52496, 17605

##Padding

In [None]:
# Longest claim / evidence of the training split. The same lengths are used
# for validation and test, whose longer sequences are truncated.
# (The built-in max already returns a Python int; the np.int alias was
# removed from NumPy.)
MAX_SEQ_LENGTH_CLAIM = max(len(a) for a in train_encoded["claim"])
MAX_SEQ_LENGTH_EVIDENCE = max(len(a) for a in train_encoded["evidence"])

# claim: pad (or cut) at the end of every sequence
X_train_claim_padded = pad_sequences(train_encoded["claim"], maxlen=MAX_SEQ_LENGTH_CLAIM, padding="post", truncating="post")
X_val_claim_padded = pad_sequences(val_encoded["claim"], maxlen=MAX_SEQ_LENGTH_CLAIM, padding="post", truncating="post")
X_test_claim_padded = pad_sequences(test_encoded["claim"], maxlen=MAX_SEQ_LENGTH_CLAIM, padding="post", truncating="post")

# evidence: pad (or cut) at the end of every sequence
X_train_evidence_padded = pad_sequences(train_encoded["evidence"], maxlen=MAX_SEQ_LENGTH_EVIDENCE, padding="post", truncating="post")
X_val_evidence_padded = pad_sequences(val_encoded["evidence"], maxlen=MAX_SEQ_LENGTH_EVIDENCE, padding="post", truncating="post")
X_test_evidence_padded = pad_sequences(test_encoded["evidence"], maxlen=MAX_SEQ_LENGTH_EVIDENCE, padding="post", truncating="post")


In [None]:
# Binary targets produced by encode(): 1 for "SUPPORTS", 0 otherwise.
encoded_Y_train = train_encoded["label"]
encoded_Y_val = val_encoded["label"]

##Building embedding matrix


In [None]:
# Embedding matrix: row i holds the vector of the word whose index is i.
# Row 0 stays all-zero because word indices start at 1.
# gensim >= 4.0 exposes the known words through `key_to_index`; the `vocab`
# attribute only exists in older releases, so it is used as a fallback.
known_words = getattr(embedding_model, "key_to_index", None) or embedding_model.vocab

emmb_mat = np.zeros((len(word_to_index) + 1, EMB_DIM))
# Rows are written at the index stored in word_to_index, so the alignment
# does not depend on the iteration order of the vocabulary set.
for w, row in word_to_index.items() :
  if w in known_words :
    emmb_mat[row] = embedding_model[w]
  else:
    # words without a pre-trained vector get a random one
    emmb_mat[row] = np.random.uniform(low=-1.0, high=1.0, size=EMB_DIM)
emmb_mat.shape

(405252, 50)

#NN building blocks
In the following section we are going to define all the layers and the functions needed to build the different models.


##Inputs

In [None]:
# Model inputs: one padded sequence of word indices per claim / evidence.
# `shape` is a real one-element tuple (note the trailing comma); a bare
# "(N)" is just the integer N.
input_claim = Input(shape=(MAX_SEQ_LENGTH_CLAIM,), name='input_claim', dtype=tf.int32)
input_evidence = Input(shape=(MAX_SEQ_LENGTH_EVIDENCE,), name='input_evidence', dtype=tf.int32)
MAX_SEQ_LENGTH_CLAIM, MAX_SEQ_LENGTH_EVIDENCE

(68, 120)

##Embedding

In [None]:
def embedding_layer_getter(input_layer_claim, input_layer_evidence, embedding_matrix):
  """
  Embed claim and evidence with one shared, frozen Embedding layer
  initialised from `embedding_matrix` (index 0 is masked).

  Returns the pair (claim embedding, evidence embedding).
  """
  vocab_size = embedding_matrix.shape[0]
  emb_dim = embedding_matrix.shape[1]
  shared_embedding = Embedding(
      vocab_size,
      emb_dim,
      weights=[embedding_matrix],
      mask_zero=True,
      name="Embedding_layer",
      trainable=False,
  )
  # The same layer object is applied to both inputs.
  return (shared_embedding(input_layer_claim), shared_embedding(input_layer_evidence))

##Sentence embedding


In [None]:
def RNN_layer_monodirectional(claim_emb, evidence_emb):
  """
  Encode claim and evidence with two separate single-direction LSTMs
  (64 units, dropout 0.2) and return, for each of them, the first of the
  two states returned by the layer.
  """
  claim_results = LSTM(64, dropout=0.2, return_sequences=True, return_state=True)(claim_emb)
  evidence_results = LSTM(64, dropout=0.2, return_sequences=True, return_state=True)(evidence_emb)
  # results are (sequence output, first state, second state)
  claim_sentence_emb = claim_results[1]
  evidence_sentence_emb = evidence_results[1]
  return (claim_sentence_emb, evidence_sentence_emb)

def RNN_layer_bidirectional(claim_emb, evidence_emb):
  """
  Encode claim and evidence with two separate bidirectional LSTMs
  (64 units per direction, dropout 0.2).

  For each input the forward state and the backward state are
  concatenated into a single sentence embedding.
  """
  claim_results = Bidirectional(LSTM(64, dropout=0.2, return_sequences=True, return_state=True))(claim_emb)
  evidence_results = Bidirectional(LSTM(64, dropout=0.2, return_sequences=True, return_state=True))(evidence_emb)
  # results are (sequence output, forward state, _, backward state, _)
  claim_forward, claim_backward = claim_results[1], claim_results[3]
  evidence_forward, evidence_backward = evidence_results[1], evidence_results[3]
  claim_sentence_emb = Concatenate()([claim_forward, claim_backward])
  evidence_sentence_emb = Concatenate()([evidence_forward, evidence_backward])
  return (claim_sentence_emb, evidence_sentence_emb)

def RNN_layer_bidirectional_average(claim_emb, evidence_emb):
  """
  Encode claim and evidence with two separate bidirectional LSTMs
  (64 units per direction, dropout 0.2) that return the whole output
  sequence, then reduce each sequence with a GlobalAveragePooling1D.
  """
  claim_sequence = Bidirectional(LSTM(64, dropout=0.2, return_sequences=True))(claim_emb)
  evidence_sequence = Bidirectional(LSTM(64, dropout=0.2, return_sequences=True))(evidence_emb)
  claim_pooled = GlobalAveragePooling1D()(claim_sequence)
  evidence_pooled = GlobalAveragePooling1D()(evidence_sequence)
  return (claim_pooled, evidence_pooled)

def dot_layer_cosine_similarity(claim_emb, evidence_emb):
  """
  Return the cosine similarity of the two embeddings, computed with a
  normalized Dot layer over axis 1.
  """
  cosine_layer = Dot(axes=1, normalize=True)
  return cosine_layer([claim_emb, evidence_emb])

def bov_layer(claim_emb, evidence_emb):
  """
  Reduce each embedded sequence to one vector with a GlobalAveragePooling1D.
  Returns the pair (claim vector, evidence vector).
  """
  claim_vector = GlobalAveragePooling1D()(claim_emb)
  evidence_vector = GlobalAveragePooling1D()(evidence_emb)
  return claim_vector, evidence_vector

def mlp_layer(embedding_layer):
  """
  Flatten the embedded sequence and pass it through four Dense layers of
  512, 256, 128 and 64 units; the output of the last one is returned.
  """
  # NOTE(review): the Dense layers have no activation -- confirm this is intended.
  hidden = Flatten()(embedding_layer)
  for units in (512, 256, 128, 64):
    hidden = Dense(units)(hidden)
  return hidden

##Merging sentence embeddings


In [None]:
def input_concatenate(claim_sentence_emb, evidence_sentence_emb):
  """
  Merge the two sentence embeddings into one by concatenating them.
  """
  merger = Concatenate()
  return merger([claim_sentence_emb, evidence_sentence_emb])

def input_sum(claim_forward, evidence_forward):
  """
  Merge the two sentence embeddings into one by adding them element-wise.
  """
  merger = Add()
  return merger([claim_forward, evidence_forward])

def input_avg(claim_forward, evidence_forward):
  """
  Merge the two sentence embeddings into one by averaging them element-wise.
  """
  merger = Average()
  return merger([claim_forward, evidence_forward])


##Binary classification Layer

In [None]:
def binary_classification_layer(layer_pre):
    """
    Classification head: ReLU Dense layers of 128, 64, 32, 16 and 8 units
    applied in that order to `layer_pre`, followed by a single sigmoid unit.
    """
    # Layers are created output-first so that they are instantiated in the
    # same order as in a nested call expression.
    output_layer = Dense(1, activation="sigmoid")
    hidden_layers = [Dense(units, activation="relu") for units in (8, 16, 32, 64, 128)]

    tensor = layer_pre
    for hidden_layer in reversed(hidden_layers):
        tensor = hidden_layer(tensor)
    return output_layer(tensor)

#Models

In [None]:
def get_model(input_claim, input_evidence, emmb_mat, s_emb = "bov", merge = "conc", cosine = False):
  """
  Assemble a claim/evidence classification model.

  Args:
    input_claim, input_evidence: the two Keras input layers.
    emmb_mat: embedding matrix handed to the shared embedding layer.
    s_emb: sentence embedding strategy, one of "RNN" (bidirectional LSTM
      states), "avg" (averaged bidirectional LSTM outputs), "bov"
      (averaged word embeddings) or "MLP".
    merge: how the two sentence embeddings are combined, one of "conc",
      "sum" or "avg".
    cosine: when true, the cosine similarity of the two sentence
      embeddings is concatenated to the merged representation.

  Returns:
    A Model mapping [input_claim, input_evidence] to the sigmoid output.

  Raises:
    ValueError: if `s_emb` or `merge` is not one of the supported values.
  """
  sentence_encoders = {
      "RNN": RNN_layer_bidirectional,
      "avg": RNN_layer_bidirectional_average,
      "bov": bov_layer,
      "MLP": lambda claim, evidence: (mlp_layer(claim), mlp_layer(evidence)),
  }
  merge_strategies = {
      "conc": input_concatenate,
      "sum": input_sum,
      "avg": input_avg,
  }

  # Reject unknown options up front, before any layer is created.
  if s_emb not in sentence_encoders:
    raise ValueError(f"Unknown s_emb {s_emb!r}; expected one of {sorted(sentence_encoders)}")
  if merge not in merge_strategies:
    raise ValueError(f"Unknown merge {merge!r}; expected one of {sorted(merge_strategies)}")

  claim_emb, evidence_emb = embedding_layer_getter(input_claim, input_evidence, emmb_mat)

  # single sentence embedding
  sentence_emb = sentence_encoders[s_emb](claim_emb, evidence_emb)

  # sentences merge
  merged_input = merge_strategies[merge](*sentence_emb)

  # optional cosine similarity feature (kept in its own name so the
  # boolean flag is not overwritten by a tensor)
  if cosine:
    similarity = dot_layer_cosine_similarity(*sentence_emb)
    merged_input = input_concatenate(merged_input, similarity)

  model_layers = binary_classification_layer(merged_input)
  return Model(inputs = [input_claim, input_evidence],
            outputs= [model_layers])


In [None]:
# Smoke test: build a model for every combination of options.
s_embs = ["RNN","avg","bov","MLP"]
merges = ["conc", "sum", "avg"]

print("Correctness test")
for s_emb in s_embs:
  for merge in merges:
    for with_cosine in (False, True):
      model_tmp = get_model(input_claim, input_evidence, emmb_mat, s_emb, merge, cosine = with_cosine)
print("Perfectly working")

Correctness test
Perfectly working


In [None]:
# Build, compile and train the "avg" model with the cosine feature.
model_1 = get_model(input_claim, input_evidence, emmb_mat, s_emb="avg", cosine=True)
plot_model(model_1, show_shapes=True)
model_1.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

history = model_1.fit(
    [X_train_claim_padded, X_train_evidence_padded],
    encoded_Y_train,
    batch_size=100,
    epochs=15,
    validation_data=([X_val_claim_padded, X_val_evidence_padded], encoded_Y_val),
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


# Performance evaluation

In [None]:
def get_caim_from_prediction(pred, data_set=None) :
  """
  Pair every predicted label with the id of the claim it refers to.

  Args:
    pred: model predictions, one value per row of `data_set`; an array of
      shape (n, 1) or (n,) is accepted.
    data_set: frame providing the 'id_claim' column. Defaults to the
      validation set (`val_set`) when omitted.

  Returns:
    A DataFrame with the columns 'id_claim' and 'label', where 'label' is
    the prediction rounded to the nearest integer.
  """
  if data_set is None:
    data_set = val_set
  pred_claim = pd.DataFrame()
  pred_claim["id_claim"] = data_set["id_claim"]
  # Flatten so that a one-dimensional array is stored in the column.
  pred_claim["label"] = np.rint(np.asarray(pred)).ravel()
  return pred_claim

def majority_evaluation(pred_claim) :
  """
  Majority vote of the per-pair predictions of each claim.

  Args:
    pred_claim: frame with the columns 'id_claim' and 'label' (1 for
      supports, 0 for refutes).

  Returns:
    A DataFrame with one row per claim, in order of first appearance, and
    the columns 'id_claim' and 'result'. 'result' is 1 when the claim has
    strictly more 1 labels than 0 labels and 0 otherwise (ties give 0).
  """
  claim_ids = pred_claim["id_claim"]
  # One grouped pass instead of rescanning the frame for every claim; this
  # also avoids chained assignment, which is a silent no-op under pandas
  # Copy-on-Write. sort=False keeps the order of first appearance.
  supports = pred_claim["label"].eq(1).astype(int).groupby(claim_ids, sort=False).sum()
  refutes = pred_claim["label"].eq(0).astype(int).groupby(claim_ids, sort=False).sum()
  result = (supports > refutes).astype(int)
  return pd.DataFrame({"id_claim": result.index.to_numpy(), "result": result.to_numpy()})

In [None]:
'''
We disabled some warnings that don't affect the operation
'''
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including pandas warnings raised by the evaluation helpers -- confirm
# that nothing relevant is being hidden.
warnings.filterwarnings('ignore')

# Predict on the validation pairs, attach the rounded predictions to their
# claim ids, then take a majority vote per claim.
predictions = model_1.predict([X_val_claim_padded, X_val_evidence_padded])
pred = get_caim_from_prediction(predictions)
majority = majority_evaluation(pred)