# Load dataset

**Permit the access to the Shared Drives for import the dataset de dev-v2.0.sjon**

In [1]:
from google.colab import drive

drive.mount('/content/drive')

#The path of our shared drive in the folder named Proc Lleng Natural
data_path = '/content/drive/Shared drives/Proc Lleng Natural/Project_data/vcr1annots'

!ls '/content/drive/Shared drives/Proc Lleng Natural/Project_data/vcr1annots'

KeyboardInterrupt: ignored

In [None]:
import json
import pandas as pd

#Load the dataset 
train_data = data_path + 'dev-v2.0.json'

with open(train_data, 'r') as f:
  data = json.load(f)


# Part 1

**1. Select 100 paragraphs, they are called contexts in the SQuAD dataset**

In [None]:
paragraphs = list()
 
#Create a list with paragraphs "context"
for i in range(len(data['data'])):
  for j in range(len(data['data'][i]['paragraphs'])):
    if(len(paragraphs) < 100):
      paragraphs.append(data['data'][i]['paragraphs'][j]['context'])

#See the results
context_data = pd.DataFrame(paragraphs)
context_data


Unnamed: 0,0
0,The Normans (Norman: Nourmands; French: Norman...
1,"The Norman dynasty had a major political, cult..."
2,"The English name ""Normans"" comes from the Fren..."
3,"In the course of the 10th century, the initial..."
4,"Before Rollo's arrival, its populations did no..."
...,...
95,Many locals and tourists frequent the southern...
96,"""Southern California"" is not a formal geograph..."
97,Though there is no official definition for the...
98,"Subsequently, Californios (dissatisfied with i..."


**2. Create a list of answers for each paragraph**

In [None]:
#Generate the list with the answers for each context
list_answers = list() 
paragraphs = list()
 
#Create a list with paragraphs "context"
for i in range(len(data['data'])):
  for j in range(len(data['data'][i]['paragraphs'])):
    if(len(paragraphs) < 100):
      paragraphs.append(data['data'][i]['paragraphs'][j]['context'])
      for k in range(len(data['data'][i]['paragraphs'][j]['qas'])):
        for l in range(len(data['data'][i]['paragraphs'][j]['qas'][k]['answers'])):
          list_answers.append(data['data'][i]['paragraphs'][j]['qas'][k]['answers'][l]['text'])
          break

#See the results
answers_data = pd.DataFrame(list_answers)
answers_data      

Unnamed: 0,0
0,France
1,10th and 11th centuries
2,"Denmark, Iceland and Norway"
3,Rollo
4,10th century
...,...
351,Los Angeles Times
352,1900
353,1999
354,Imperial


### Spicy Model

**3. Run the model for named entity recognition/concept extraction and extract
entities in each paragraph**


In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")
entities = list()

for i in range(len(paragraphs)):
  doc = nlp(paragraphs[i])
  for ent in doc.ents:
    if ent.text not in entities:
      entities.append(ent.text) 

#See the results
entities_data = pd.DataFrame(entities)
entities_data      

Unnamed: 0,0
0,Normans
1,Norman
2,Nourmands
3,French
4,Latin
...,...
600,1900
601,the Los Angeles Times
602,1999
603,Times


**4. For each paragraph find out the number of answers that are in a list of
extracted entities and sum up these numbers**

In [None]:
number_entities_context = 0

for i in range(len(list_answers)):
  for j in range(len(entities)):
    if (list_answers[i] == entities[j]):
     number_entities_context = number_entities_context + 1

print("The number of entities in the answers are:", number_entities_context)

The number of entities in the answers are: 106


**5. Return a single value that is the proportion of answers that are entities.**

In [None]:
proportion_of_entities = 0

proportion_of_entities = number_entities_context / len(list_answers)

print("The proportion is: ", proportion_of_entities)

The proportion is:  0.29775280898876405


### Nltk Model

**3. Run the model for named entity recognition/concept extraction and extract entities in each paragraph**

In [None]:
import nltk

import string
nltk.download('maxent_ne_chunker')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('words')

entities_nltk = list()

def ntlk_entity(text):
    for chunk in chunks: 
      if hasattr(chunk,'label'):
        entities_nltk.append(' '.join(c[0] for c in chunk))

for i in range(len(paragraphs)):
  words = nltk.word_tokenize(paragraphs[i])
  pos_tags = nltk.pos_tag(words)
  chunks = nltk.ne_chunk(pos_tags, binary=True)
  ntlk_entity(chunks)

#See the results
entities_df = pd.DataFrame(entities_nltk)
entities_df

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


Unnamed: 0,0
0,Normans
1,Norman
2,French
3,Normandy
4,France
...,...
775,Riverside
776,San Diego
777,Ventura
778,Santa Barbara


**4. For each paragraph find out the number of answers that are in a list of
extracted entities and sum up these numbers**

In [None]:
number_entities_context = 0

for i in range(len(list_answers)):
  for j in range(len(entities_nltk)):
    if (list_answers[i] == entities_nltk[j]):
     number_entities_context = number_entities_context + 1

print("The number of entities in the answers are:", number_entities_context)

The number of entities in the answers are: 273


**5. Return a single value that is the proportion of answers that are entities.**

In [None]:
proportion_of_entities = 0

proportion_of_entities = number_entities_context / len(list_answers)

print("The proportion is: ", proportion_of_entities)

The proportion is:  0.7668539325842697


# Part 2

**1. Select 100 paragraphs that have at least one corresponding “who” question.**

In [None]:
paragraphs_who_list = list()

#Create a list with paragraphs "context"
for i in range(len(data['data'])):
  for j in range(len(data['data'][i]['paragraphs'])):
    if data['data'][i]['paragraphs'] not in paragraphs_who_list:
      if len(paragraphs_who_list) < 100:
        for k in range(len(data['data'][i]['paragraphs'][j]['qas'])):
          if data['data'][i]['paragraphs'][j]['qas'][k]['question'].startswith('Who'):
            paragraphs_who_list.append(data['data'][i]['paragraphs'][j]['context'])
            break

#See the results
context_data = pd.DataFrame(paragraphs_who_list)
display(context_data)


Unnamed: 0,0
0,The Normans (Norman: Nourmands; French: Norman...
1,"The Norman dynasty had a major political, cult..."
2,"In the course of the 10th century, the initial..."
3,"Before Rollo's arrival, its populations did no..."
4,The Normans thereafter adopted the growing feu...
...,...
95,In 1891 Scottish chemist James Dewar was able ...
96,By the late 19th century scientists realized t...
97,Paleoclimatologists measure the ratio of oxyge...
98,Hyperbaric (high-pressure) medicine uses speci...


**2. Create a list of answers only to “who” questions for each paragraph.**

In [None]:
answers_list = list()

#Create a list with paragraphs "context"
for i in range(len(data['data'])):
  for j in range(len(data['data'][i]['paragraphs'])):
        for k in range(len(data['data'][i]['paragraphs'][j]['qas'])):
          if data['data'][i]['paragraphs'][j]['qas'][k]['question'].startswith('Who'):
            for l in range(len(data['data'][i]['paragraphs'][j]['qas'][k]['answers'])):
              if data['data'][i]['paragraphs'][j]['qas'][k]['answers'][l]['text'] not in answers_list:
                answers_list.append(data['data'][i]['paragraphs'][j]['qas'][k]['answers'][l]['text'])
            break


#See the results
context_data = pd.DataFrame(answers_list)
display(context_data)

Unnamed: 0,0
0,Rollo
1,William the Conqueror
2,King Charles III
3,Seljuk Turks
4,"the Pechenegs, the Bulgars, and especially the..."
...,...
484,Albert Einstein
485,Isaac Newton
486,Galileo
487,Henry Cavendish


**3-4. Apply syntactic parsing for each paragraph and find answers in parse trees and assign a syntactic tag for each answer.**

In [None]:
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
parsing = list()

#Apply syntactic parsing for the list of paragraphs
for i in range(len(paragraphs_who_list)):
  doc = nlp(paragraphs_who_list[i])
  sentence_spans = list(doc.sents)
  displacy.render(sentence_spans, style = 'dep', jupyter = True)

**5. For each paragraph find out the number of “who” answers that appear in a position of the grammatical subject (nsubj) and sum up these numbers.**

In [None]:
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")

counter_nsubj = 0

for i in range(len(paragraphs_who_list)):
  doc = nlp(paragraphs_who_list[i])
  
  for chunk in doc.noun_chunks:
    for answer in answers_list:
      if chunk.text == answer:
        if chunk.root.dep_ == 'nsubj':
          counter_nsubj = counter_nsubj + 1

print("The number of 'who' answers that are subjects (nsubj):", counter_nsubj)


The number of 'who' that appear in a position of the grammatical subject (nsubj): 38


**6. For each paragraph find out the number of “who” answers that appear in the position of the grammatical object (podj or dobj) and sum up these numbers.**

In [None]:
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")

counter_podj_dobj = 0

for i in range(len(paragraphs_who_list)):
  doc = nlp(paragraphs_who_list[i])
  
  for chunk in doc.noun_chunks:
    for answer in answers_list:
      if chunk.text == answer:
        if chunk.root.dep_ == 'pobj' or chunk.root.dep_ == 'dobj':
          counter_podj_dobj = counter_podj_dobj + 1

print("The number of 'who' answers that are objects (podj or dobj ):", counter_podj_dobj)

The number of 'who' that appear in a position of the grammatical subject (podj or dobj ): 42


**7. Return two values that are the proportion of answers that are subjects and the proportion of answers that are objects.**

In [None]:
proportion_of_answers__nsubj = 0
proportion_of_answers__podj_dobj= 0

proportion_of_answers__nsubj = counter_nsubj / len(answers_list)

proportion_of_answers__podj_dobj = counter_podj_dobj / len(answers_list)


print("The proportion of subjects: ", proportion_of_answers__nsubj)
print("The proportion of objects: ", proportion_of_answers__podj_dobj)

The proportion of subjects:  0.07770961145194274
The proportion of objects:  0.08588957055214724
