In [72]:
import os
import json

### Scarichiamo i file SQUAD v1.1

In [73]:
train = "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json"
dev = "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json"
evaluate = "https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py"

In [74]:
if not os.path.exists("./SQUAD/train-v1.1.json"):
    !wget $train && mv train-v1.1.json SQUAD/

if not os.path.exists("./SQUAD/dev-v1.1.json"):
    !wget $dev && mv dev-v1.1.json SQUAD/

if not os.path.exists("./SQUAD/evaluate-v1.1.py"):
    !wget $evaluate && mv evaluate-v1.1.py SQUAD/

### Carichiamoli in memoria

In [75]:
train_data = json.load(open("./SQUAD/train-v1.1.json"))
dev_data =  json.load(open("./SQUAD/dev-v1.1.json"))

### Analizziamone la struttura
1. Capiamo il tipo
2. Se è un dict controlliamo le sottostruttre
3. Se è una lista guardiamone la lunghezza ed il contenuto
4. Se è stringa stampiamone il contenuto
5. Per ogni Key,Values del punto 2 e per ogni oggetto del punto 3 ripetiamo il processo dal punto 1.

In [76]:
print("*** Train Stats ***")
for k,v in train_data.items():
    print(f"Key: '{k}', Value Type: {type(v)}")
    if type(v) == dict:
        print(f"\tValues keys: {v.keys()}")
    elif type(v) == list:
        print(f"\tLength of values list: {len(v)}")
    else:
        print(f"\tValue: {v}")
          
        

*** Train Stats ***
Key: 'data', Value Type: <class 'list'>
	Length of values list: 442
Key: 'version', Value Type: <class 'str'>
	Value: 1.1


In [77]:
print("*** Dev Stats ***")
for k,v in dev_data.items():
    print(f"Key: '{k}', Value Type: {type(v)}")
    if type(v) == dict:
        print(f"\tValues keys: {v.keys()}")
    elif type(v) == list:
        print(f"\tLength of values list: {len(v)}")
    else:
        print(f"\tValue: {v}")

*** Dev Stats ***
Key: 'data', Value Type: <class 'list'>
	Length of values list: 48
Key: 'version', Value Type: <class 'str'>
	Value: 1.1


In [78]:
passage = train_data["data"][0]
print(passage.keys())    

dict_keys(['title', 'paragraphs'])


In [79]:
print(passage["title"])
print(type(passage["paragraphs"]))

University_of_Notre_Dame
<class 'list'>


In [80]:
paragraph = passage["paragraphs"][0]
type(paragraph)

dict

In [81]:
paragraph.keys()

dict_keys(['context', 'qas'])

In [82]:
type(paragraph['context'])

str

In [83]:
paragraph['context']

'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.'

In [84]:
type(paragraph['qas'])

list

In [85]:
qa = paragraph['qas'][0]
type(qa)

dict

In [86]:
qa.keys()

dict_keys(['answers', 'question', 'id'])

In [87]:
print(qa["id"])

5733be284776f41900661182


In [88]:
print(qa["question"])

To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?


In [89]:
print(qa["answers"])

[{'answer_start': 515, 'text': 'Saint Bernadette Soubirous'}]


In [90]:
{
    "version": str,
    "data": [
        {
            "title": str,
            "paragraphs": [
                {
                    "context": str,
                    "qas": [
                        {
                            "id": str,
                            "question": str,
                            "answers": [
                                {
                                    "answer_start": int,
                                    "text": str
                                }
                            ]
                        }
                    ]
                }    
            ]
        }
    ]
}

{'version': str,
 'data': [{'title': str,
   'paragraphs': [{'context': str,
     'qas': [{'id': str,
       'question': str,
       'answers': [{'answer_start': int, 'text': str}]}]}]}]}

### La struttura del Dev set è uguale al train quindi non rifacciamo l'esercizio e passiamo a rilevare delle statistiche sul dataset
1. Quanti passaggi
2. Quanti paragrafi in totale, per passaggio in media
3. Lunghezza media/min/max di ogni passaggio in caratteri e token (un token è un pezzo di testo tra due spazi)
4. Quante domande e quante risposte per ogni paragrafo in media
5. Dato che abbiamo visto che le 'answers' sono di tipo lista, quante risposte ci sono in media/min/max per ogni domanda


In [130]:
import numpy as np
def get_stats(data_dict):
    n_docs = 0
    n_ans = 0
    len_token = []
    len_chars = []
    par_list = []
    q_list = []
    q_to_context = {}
    q_to_context_noanswers = {}
    for doc in data_dict["data"]:
        n_docs += 1
        for par in doc["paragraphs"]:
            par_list.append(par["context"])
            len_token.append(len(par["context"].split(" ")))
            len_chars.append(len(par["context"]))
            for qa in par["qas"]:
                q_list.append(qa["question"])
                q_to_context[qa["question"]] = par["context"]
                if len(qa["answers"]) == 0:
                    q_to_context_noanswers[qa["question"]] = par["context"]
                for ans in qa["answers"]:
                    n_ans += 1
                    
    print(f"Docs: {n_docs}")
    print(f"Paragraphs: {len(par_list)}")
    print(f"Q&A: {len(q_list)}")
    print(f"Answers: {n_ans}")
    print(f"Mean number of Paragraphs per Document: {round(len(par_list)/n_docs,2)}")
    print(f"Min length in Tokens of a Paragraph: {np.min(len_token)}")
    print(f"Max length in Tokens of a Paragraph: {np.max(len_token)}")
    print(f"Mean length in Tokens of a Paragraph: {round(np.mean(len_token),2)}")
    print(f"Min length in Chars of a Paragraph: {np.min(len_chars)}")
    print(f"Max length in Chars of a Paragraph: {np.max(len_chars)}")
    print(f"Mean length in Chars of a Paragraph: {round(np.mean(len_chars),2)}")
    return par_list, q_list, q_to_context, q_to_context_noanswers
                             

In [137]:
print("Train Data:")
train_par_list, train_q_list, train_q_to_context, _ = get_stats(train_data)

Train Data:
Docs: 442
Paragraphs: 18896
Q&A: 87599
Answers: 87599
Mean number of Paragraphs per Document: 42.75
Min length in Tokens of a Paragraph: 20
Max length in Tokens of a Paragraph: 653
Mean length in Tokens of a Paragraph: 116.63
Min length in Chars of a Paragraph: 151
Max length in Chars of a Paragraph: 3706
Mean length in Chars of a Paragraph: 735.78


In [132]:
print("Dev Data:")
dev_par_list, dev_q_list, dev_q_to_context, _ = get_stats(dev_data)

Dev Data:
Docs: 48
Paragraphs: 2067
Q&A: 10570
Answers: 34726
Mean number of Paragraphs per Document: 43.06
Min length in Tokens of a Paragraph: 22
Max length in Tokens of a Paragraph: 629
Mean length in Tokens of a Paragraph: 122.76
Min length in Chars of a Paragraph: 157
Max length in Chars of a Paragraph: 4063
Mean length in Chars of a Paragraph: 774.33


## Adesso che sappiamo un pò di più della struttura del dataset, vediamo qualitativamente che dati contiene
Nelle prossime celle analizzeremo il sentiment dei paragrafi e delle domande contenute nel dataset

In [94]:
from transformers import pipeline
from tqdm import tqdm

In [95]:
def sentiment_analysis(paragraph_list):
    classifier = pipeline('sentiment-analysis', framework="tf", device=0)
    sentiment_recap = {}
    errors = []
    for txt in tqdm(paragraph_list):
        try:
            sent = classifier(txt)[0]
            label = sent["label"]
            if label not in sentiment_recap:
                sentiment_recap[label] = 0
            sentiment_recap[label] += 1
        except Exception as e:
            errors.append(e)
    print(f"Number of errors: {len(errors)}")
    if len(errors) > 0:
        print(errors)
    return sentiment_recap
    

In [26]:
train_sent_recap_par = sentiment_analysis(train_par_list)

  3%|▎         | 499/18896 [00:15<09:34, 32.04it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (720 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 18896/18896 [10:01<00:00, 31.39it/s]


In [49]:
print("Train Paragraph Sentiment Recap:")
print(train_sent_recap_par)

Train Paragraph Sentiment Recap:
{'POSITIVE': 9584, 'NEGATIVE': 9312}


In [43]:
dev_sent_recap_par = sentiment_analysis(dev_par_list)

Some layers from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english and are newly initialized: ['dropout_59']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 36%|███▋   

Number of errors: 0





In [50]:
print("Dev Paragraph Sentiment Recap:")
print(dev_sent_recap_par)

Dev Paragraph Sentiment Recap:
{'POSITIVE': 2067}


In [117]:
from sklearn.metrics.pairwise import cosine_similarity

def embedding_similarity(q_to_context, limit = 1000):
    feature_extractor = pipeline('feature-extraction', framework="tf", device=0)
    q_to_context_similarity = []
    question_keys = list(q_to_context.keys())
    np.random.shuffle(question_keys)
    selected_questions = question_keys[:limit]
    for question in tqdm(selected_questions):
        context = q_to_context[question]
        q_embedding = np.mean(feature_extractor(question)[0],axis=0)
        cntxt_embedding = np.mean(feature_extractor(context)[0],axis=0)
        q_to_context_similarity.append(cosine_similarity([q_embedding], [cntxt_embedding])[0][0])
        if len(q_to_context_similarity) >= limit:
            break
    
    print(f"Mean similarity between Question and Context: {np.mean(q_to_context_similarity)}")
    print(f"Min similarity between Question and Context: {np.min(q_to_context_similarity)}")
    print(f"Max similarity between Question and Context: {np.max(q_to_context_similarity)}")
    return q_to_context_similarity

In [118]:
q_to_context_similarity_train = embedding_similarity(train_q_to_context)

Some layers from the model checkpoint at distilbert-base-cased were not used when initializing TFDistilBertModel: ['vocab_transform', 'vocab_projector', 'vocab_layer_norm', 'activation_13']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.
100%|█████████▉| 999/1000 [01:08<00:00, 14.62it/s]

Mean similarity between Question and Context: 0.8687367180927811
Min similarity between Question and Context: 0.5058140402280348
Max similarity between Question and Context: 0.9578740581088679





In [119]:
q_to_context_similarity_train = embedding_similarity(dev_q_to_context)

Some layers from the model checkpoint at distilbert-base-cased were not used when initializing TFDistilBertModel: ['vocab_transform', 'vocab_projector', 'vocab_layer_norm', 'activation_13']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.
 11%|█▏        | 114/1000 [00:07<01:02, 14.27it/s]Token indices sequence len

Mean similarity between Question and Context: 0.8726055091638873
Min similarity between Question and Context: 0.6852343126241149
Max similarity between Question and Context: 0.9556624586384194





### SQUAD v2.0 
Adesso analizziamo la distanza di embedding tra le domande senza risposta ed i loro rispettivi passaggi

In [121]:
squad_train_v2 = json.load(open("./train-v2.0.json"))
squad_dev_v2 = json.load(open("./dev-v2.0.json"))

In [133]:
train_v2_par_list, train_v2_q_list, train_v2_q_to_context, train_v2_q_to_context_noanswers = get_stats(squad_train_v2)

Docs: 442
Paragraphs: 19035
Q&A: 130319
Answers: 86821
Mean number of Paragraphs per Document: 43.07
Min length in Tokens of a Paragraph: 20
Max length in Tokens of a Paragraph: 653
Mean length in Tokens of a Paragraph: 116.59
Min length in Chars of a Paragraph: 151
Max length in Chars of a Paragraph: 3706
Mean length in Chars of a Paragraph: 735.55


In [134]:
dev_v2_par_list, dev_v2_q_list, dev_v2_q_to_context, dev_v2_q_to_context_noanswers = get_stats(squad_dev_v2)

Docs: 35
Paragraphs: 1204
Q&A: 11873
Answers: 20302
Mean number of Paragraphs per Document: 34.4
Min length in Tokens of a Paragraph: 25
Max length in Tokens of a Paragraph: 629
Mean length in Tokens of a Paragraph: 126.54
Min length in Chars of a Paragraph: 169
Max length in Chars of a Paragraph: 4063
Mean length in Chars of a Paragraph: 802.61


In [135]:
q_to_context_similarity_noanswer_train_v2 = embedding_similarity(train_v2_q_to_context_noanswers)

Some layers from the model checkpoint at distilbert-base-cased were not used when initializing TFDistilBertModel: ['vocab_transform', 'vocab_projector', 'vocab_layer_norm', 'activation_13']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.
 50%|█████     | 504/1000 [00:34<00:33, 14.81it/s]Token indices sequence len

Mean similarity between Question and Context: 0.8620405550912449
Min similarity between Question and Context: 0.6554168797417589
Max similarity between Question and Context: 0.9529050336541132





In [136]:
q_to_context_similarity_noanswer_dev_v2 = embedding_similarity(dev_v2_q_to_context_noanswers)

Some layers from the model checkpoint at distilbert-base-cased were not used when initializing TFDistilBertModel: ['vocab_transform', 'vocab_projector', 'vocab_layer_norm', 'activation_13']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.
 14%|█▍        | 144/1000 [00:09<00:57, 14.93it/s]Token indices sequence len

Mean similarity between Question and Context: 0.8638132165284282
Min similarity between Question and Context: 0.5489242513107354
Max similarity between Question and Context: 0.9522213888134166



