In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json

In [None]:
# Mount Google Drive at /content/drive (Colab only); the SQuAD JSON paths used below live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# SQUAD JSON TO PANDAS DATAFRAME FOR TRAIN SET

In [None]:
def squad_json_to_dataframe_train(input_file_path, record_path = ['data','paragraphs','qas','answers'],
                           verbose = 1):
    """
    Flatten a SQuAD-style JSON file into one row per (question, answer) pair.

    input_file_path: path to the squad json file.
    record_path: keys leading to the deepest (answer) level of the json file,
        default value is ['data','paragraphs','qas','answers']. The sequence
        is only read, never modified.
    verbose: 0 to suppress progress messages, default is 1.

    Returns a DataFrame with the columns 'index' (question id), 'question',
    'context', the answer fields found in the file (e.g. 'text',
    'answer_start') and 'c_id' (integer code per distinct context, numbered
    in order of first appearance). A question without answers keeps a single
    row whose answer fields are NaN; a question with several answers yields
    one row per answer.
    """
    if verbose:
        print("Reading the json file")
    # The context manager closes the handle; JSON text is UTF-8, so do not
    # depend on the platform's default encoding.
    with open(input_file_path, encoding="utf-8") as handle:
        squad = json.load(handle)
    if verbose:
        print("Processing...")

    # One frame per nesting level of the json file.
    record_path = list(record_path)
    answers = pd.json_normalize(squad, record_path)
    questions = pd.json_normalize(squad, record_path[:-1])
    paragraphs = pd.json_normalize(squad, record_path[:-2])

    # Propagate each paragraph's context to its questions and each question's
    # id to its answers by repeating the parent value once per child.
    questions['context'] = np.repeat(paragraphs['context'].values, paragraphs['qas'].str.len())
    answers['id'] = np.repeat(questions['id'].values, questions['answers'].str.len())

    # A left merge keeps the question order and the unanswered questions, and
    # (unlike an index-aligned concat) tolerates several answers per question.
    main = (
        questions[['id', 'question', 'context']]
        .merge(answers, how='left', on='id')
        .rename(columns={'id': 'index'})
    )
    main['c_id'] = main['context'].factorize()[0]
    if verbose:
        print("shape of the dataframe is {}".format(main.shape))
        print("Done")
    return main

In [None]:
# SQUAD JSON TO PANDAS DATAFRAME FOR THE DEV (TEST) SET, WHERE SOME QUESTIONS HAVE MULTIPLE ANSWERS

In [None]:
def squad_json_to_dataframe_dev(input_file_path, record_path = ['data','paragraphs','qas','answers'],
                           verbose = 1):
    """
    Flatten a SQuAD-style JSON file into one row per question, keeping the
    raw list of answers for each question.

    input_file_path: path to the squad json file.
    record_path: keys leading to the deepest (answer) level of the json file,
        default value is ['data','paragraphs','qas','answers']. Only the
        question and paragraph levels are read here.
    verbose: 0 to suppress progress messages, default is 1.

    Returns a DataFrame with the columns 'id', 'question', 'context',
    'answers' (the list of answer dicts exactly as stored in the file) and
    'c_id' (integer code per distinct context, in order of first appearance).
    """
    if verbose:
        print("Reading the json file")
    # The context manager closes the handle; JSON text is UTF-8, so do not
    # depend on the platform's default encoding.
    with open(input_file_path, encoding="utf-8") as handle:
        squad = json.load(handle)
    if verbose:
        print("processing...")

    # Only the question and paragraph levels are needed: answers stay nested.
    record_path = list(record_path)
    questions = pd.json_normalize(squad, record_path[:-1])
    paragraphs = pd.json_normalize(squad, record_path[:-2])

    # Repeat each paragraph's context once per question it contains.
    questions['context'] = np.repeat(paragraphs['context'].values, paragraphs['qas'].str.len())

    # Explicit copy so that adding 'c_id' never writes into a view of `questions`.
    main = questions[['id', 'question', 'context', 'answers']].copy()
    main['c_id'] = main['context'].factorize()[0]
    if verbose:
        print("shape of the dataframe is {}".format(main.shape))
        print("Done")
    return main

In [None]:
# Load the SQuAD v2.0 training file from the mounted Drive and flatten it into `train`.
# NOTE(review): `input_file_path` and `record_path` are notebook globals that the dev-set cell reassigns later.
input_file_path = "/content/drive/MyDrive/AutomatedQuestionGeneration/SQUAD/train-v2.0.json"
record_path = ['data','paragraphs','qas','answers']
train = squad_json_to_dataframe_train(input_file_path=input_file_path,record_path=record_path)

Reading the json file
Processing...
shape of the dataframe is (130319, 6)
Done


In [None]:
# Preview the flattened training frame (first 1200 rows requested; the notebook display elides the middle rows).
train.head(1200)

Unnamed: 0,index,question,context,text,answer_start,c_id
0,56be85543aeaaa14008c9063,When did Beyonce start becoming popular?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,in the late 1990s,269.0,0
1,56be85543aeaaa14008c9065,What areas did Beyonce compete in when she was...,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,singing and dancing,207.0,0
2,56be85543aeaaa14008c9066,When did Beyonce leave Destiny's Child and bec...,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,2003,526.0,0
3,56bf6b0f3aeaaa14008c9601,In what city and state did Beyonce grow up?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,"Houston, Texas",166.0,0
4,56bf6b0f3aeaaa14008c9602,In which decade did Beyonce become famous?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,late 1990s,276.0,0
...,...,...,...,...,...,...
1195,56d3ac8e2ccc5a1400d82e1e,"Where was the fictionalized ""Chopin"" produced?",Possibly the first venture into fictional trea...,Milan,179.0,103
1196,56cf50b2aab44d1400b88fbd,An 1830 sonnet was written about Chopin by wha...,Chopin has figured extensively in Polish liter...,Leon Ulrich,205.0,104
1197,56cf50b2aab44d1400b88fbe,Aside from George Sands what two French author...,Chopin has figured extensively in Polish liter...,Marcel Proust and André Gide,275.0,104
1198,56cf50b2aab44d1400b88fbf,Leon Ulrich wrote about Chopin in what format?,Chopin has figured extensively in Polish liter...,sonnet,185.0,104


In [None]:
# Frequency of each distinct answer string (value_counts skips NaN by default, so unanswerable rows are not counted).
train["text"].value_counts()

three                       231
two                         206
four                        171
five                        133
six                          90
                           ... 
the Russian abbot Daniel      1
early stages                  1
regal vestments               1
Blancs-Moussis                1
Lake Stanley Draper           1
Name: text, Length: 64763, dtype: int64

In [None]:
# Boolean mask: True for rows that have no answer text.
train["text"].isna()

0         False
1         False
2         False
3         False
4         False
          ...  
130314     True
130315     True
130316     True
130317     True
130318     True
Name: text, Length: 130319, dtype: bool

In [None]:
# Full context paragraph of the fifth row, selected by column name rather than by position.
train["context"].iloc[4]

'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".'

In [None]:
# There are unanswerable questions: rows with a null 'text' --> 130319 - 86821 = 43498
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130319 entries, 0 to 130318
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   index         130319 non-null  object 
 1   question      130319 non-null  object 
 2   context       130319 non-null  object 
 3   text          86821 non-null   object 
 4   answer_start  86821 non-null   float64
 5   c_id          130319 non-null  int64  
dtypes: float64(1), int64(1), object(4)
memory usage: 6.0+ MB


In [None]:
# Some question strings repeat: the distinct count is lower than the number of rows.
train["question"].unique().size

130217

In [None]:
# Answer-string frequencies again (this cell repeats an earlier value_counts call on the same column).
train["text"].value_counts()

three                       231
two                         206
four                        171
five                        133
six                          90
                           ... 
the Russian abbot Daniel      1
early stages                  1
regal vestments               1
Blancs-Moussis                1
Lake Stanley Draper           1
Name: text, Length: 64763, dtype: int64

In [None]:
# Analysis of unanswerable questions in the train set: keep the rows whose answer text is missing.
unanswerable_train = train.loc[train["text"].isna()]

In [None]:
# First five unanswerable rows; their original row labels from `train` are preserved.
unanswerable_train.head(5)

Unnamed: 0,index,question,context,text,answer_start,c_id
2075,5a8d7bf7df8bba001a0f9ab1,What category of game is Legend of Zelda: Aust...,The Legend of Zelda: Twilight Princess (Japane...,,,280
2076,5a8d7bf7df8bba001a0f9ab2,What consoles can be used to play Australia Tw...,The Legend of Zelda: Twilight Princess (Japane...,,,280
2077,5a8d7bf7df8bba001a0f9ab3,When was Australia Twilight launched in North ...,The Legend of Zelda: Twilight Princess (Japane...,,,280
2078,5a8d7bf7df8bba001a0f9ab4,When could GameCube owners purchase Australian...,The Legend of Zelda: Twilight Princess (Japane...,,,280
2079,5a8d7bf7df8bba001a0f9ab5,What year was the Legend of Zelda: Australian ...,The Legend of Zelda: Twilight Princess (Japane...,,,280


In [None]:
# Context paragraph (column position 2) of the first unanswerable row.
unanswerable_train.iloc[0,2]

'The Legend of Zelda: Twilight Princess (Japanese: ゼルダの伝説 トワイライトプリンセス, Hepburn: Zeruda no Densetsu: Towairaito Purinsesu?) is an action-adventure game developed and published by Nintendo for the GameCube and Wii home video game consoles. It is the thirteenth installment in the The Legend of Zelda series. Originally planned for release on the GameCube in November 2005, Twilight Princess was delayed by Nintendo to allow its developers to refine the game, add more content, and port it to the Wii. The Wii version was released alongside the console in North America in November 2006, and in Japan, Europe, and Australia the following month. The GameCube version was released worldwide in December 2006.[b]'

In [None]:
# Print the first five unanswerable questions (column position 1), each followed by a blank line.
for i in range(5):
  print(unanswerable_train.iloc[i,1], end="\n\n")

What category of game is Legend of Zelda: Australia Twilight?

What consoles can be used to play Australia Twilight?

When was Australia Twilight launched in North America?

When could GameCube owners purchase Australian Princess?

What year was the Legend of Zelda: Australian Princess originally planned for release?



In [None]:
# importing the dev (test) set
input_file_path = '/content/drive/MyDrive/AutomatedQuestionGeneration/SQUAD/dev-v2.0.json'
record_path = ['data','paragraphs','qas','answers']
verbose = 0
# The loader defaults to verbose=1, so the flag has to be forwarded explicitly to silence the progress messages.
dev = squad_json_to_dataframe_dev(input_file_path=input_file_path,record_path=record_path,verbose=verbose)

Reading the json file
processing...
shape of the dataframe is (11873, 5)
Done


In [None]:
def testset_preprocessing(dev):
  def duplicate_removal(l):
    return [dict(t) for t in {tuple(d.items()) for d in l}] 

  dev["answers"] = dev["answers"].apply(duplicate_removal)

  id = []
  question = []
  context = []
  text = []
  answer_start = []
  c_id = []

  for i in range(len(dev)):
    if not (dev["answers"][i]): 
      id.append(dev["id"][i])
      question.append(dev["question"][i])
      context.append(dev["context"][i])
      text.append(None)
      answer_start.append(None)
      c_id.append(dev["c_id"][i])
    
    for j in range(len(dev["answers"][i])):
      id.append(dev["id"][i])
      question.append(dev["question"][i])
      context.append(dev["context"][i])
      text.append(dev["answers"][i][j]["text"])
      answer_start.append(dev["answers"][i][j]["answer_start"])
      c_id.append(dev["c_id"][i])
  
  return pd.DataFrame(zip(id,question,context,text,answer_start,c_id), columns=["index","question","context","text","answer_start","c_id"]).fillna(value=np.nan)

In [None]:
# Replace the per-question dev frame with its per-answer expansion.
# NOTE(review): this rebinds `dev` to a frame without an 'answers' column, so the cell cannot be
# re-run without first re-running the dev-set loading cell.
dev = testset_preprocessing(dev)

In [None]:
# Preview the expanded dev frame: a question with several distinct answers occupies several rows.
dev.head(20)

Unnamed: 0,index,question,context,text,answer_start,c_id
0,56ddde6b9a695914005b9628,In what country is Normandy located?,The Normans (Norman: Nourmands; French: Norman...,France,159.0,0
1,56ddde6b9a695914005b9629,When were the Normans in Normandy?,The Normans (Norman: Nourmands; French: Norman...,10th and 11th centuries,94.0,0
2,56ddde6b9a695914005b9629,When were the Normans in Normandy?,The Normans (Norman: Nourmands; French: Norman...,in the 10th and 11th centuries,87.0,0
3,56ddde6b9a695914005b962a,From which countries did the Norse originate?,The Normans (Norman: Nourmands; French: Norman...,"Denmark, Iceland and Norway",256.0,0
4,56ddde6b9a695914005b962b,Who was the Norse leader?,The Normans (Norman: Nourmands; French: Norman...,Rollo,308.0,0
5,56ddde6b9a695914005b962c,What century did the Normans first gain their ...,The Normans (Norman: Nourmands; French: Norman...,10th,671.0,0
6,56ddde6b9a695914005b962c,What century did the Normans first gain their ...,The Normans (Norman: Nourmands; French: Norman...,the first half of the 10th century,649.0,0
7,56ddde6b9a695914005b962c,What century did the Normans first gain their ...,The Normans (Norman: Nourmands; French: Norman...,10th century,671.0,0
8,5ad39d53604f3c001a3fe8d1,Who gave their name to Normandy in the 1000's ...,The Normans (Norman: Nourmands; French: Norman...,,,0
9,5ad39d53604f3c001a3fe8d2,What is France a region of?,The Normans (Norman: Nourmands; French: Norman...,,,0


In [None]:
# Rows of the dev frame that have no answer text.
dev[dev["text"].isna()]

Unnamed: 0,index,question,context,text,answer_start,c_id
8,5ad39d53604f3c001a3fe8d1,Who gave their name to Normandy in the 1000's ...,The Normans (Norman: Nourmands; French: Norman...,,,0
9,5ad39d53604f3c001a3fe8d2,What is France a region of?,The Normans (Norman: Nourmands; French: Norman...,,,0
10,5ad39d53604f3c001a3fe8d3,Who did King Charles III swear fealty to?,The Normans (Norman: Nourmands; French: Norman...,,,0
11,5ad39d53604f3c001a3fe8d4,When did the Frankish identity emerge?,The Normans (Norman: Nourmands; French: Norman...,,,0
16,5ad3a266604f3c001a3fea27,What type of major impact did the Norman dynas...,"The Norman dynasty had a major political, cult...",,,1
...,...,...,...,...,...,...
16318,5ad28a57d7d075001a4299b3,What does not change macroscopic closed systems?,The connection between macroscopic nonconserva...,,,1202
16329,5ad28ad0d7d075001a4299cc,What does not have a metric counterpart?,"The pound-force has a metric counterpart, less...",,,1203
16330,5ad28ad0d7d075001a4299cd,What is the force exerted by standard gravity ...,"The pound-force has a metric counterpart, less...",,,1203
16331,5ad28ad0d7d075001a4299ce,What force leads to a commonly used unit of mass?,"The pound-force has a metric counterpart, less...",,,1203


In [None]:
# There are some questions that are unanswerable: rows with a null 'text' --> 16333 - 10388 = 5945
dev.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16333 entries, 0 to 16332
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   index         16333 non-null  object 
 1   question      16333 non-null  object 
 2   context       16333 non-null  object 
 3   text          10388 non-null  object 
 4   answer_start  10388 non-null  float64
 5   c_id          16333 non-null  int64  
dtypes: float64(1), int64(1), object(4)
memory usage: 765.7+ KB


In [None]:
# TOPIC MODELLING ON SQUAD

In [None]:
!pip3 install bertopic[all]

Collecting bertopic[all]
[?25l  Downloading https://files.pythonhosted.org/packages/f6/9e/16678af67081452c01fcaeca5fd734a1033be2da0e9d40815ee742588ef4/bertopic-0.8.1-py2.py3-none-any.whl (53kB)
[K     |████████████████████████████████| 61kB 8.3MB/s 
[?25hCollecting hdbscan>=0.8.27
[?25l  Downloading https://files.pythonhosted.org/packages/32/bb/59a75bc5ac66a9b4f9b8f979e4545af0e98bb1ca4e6ae96b3b956b554223/hdbscan-0.8.27.tar.gz (6.4MB)
[K     |████████████████████████████████| 6.4MB 36.9MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting sentence-transformers>=0.4.1
[?25l  Downloading https://files.pythonhosted.org/packages/3b/fd/8a81047bbd9fa134a3f27e12937d2a487bd49d353a038916a5d7ed4e5543/sentence-transformers-2.0.0.tar.gz (85kB)
[K     |████████████████████████████████| 92kB 13.8MB/s 
Collecting umap-learn>=0.5.0
[?25l  Downloading https://files.p

In [None]:
from bertopic import BERTopic

In [None]:
# Topic modelling input: each distinct context paragraph once, regardless of how many questions share it.
unique_contexts = train["context"].unique()

In [None]:
# Fit BERTopic on the distinct contexts; the second return value of fit_transform is discarded here.
# NOTE(review): no random seed is set, so topic ids and counts may differ between runs — confirm whether reproducibility is required.
topic_model = BERTopic(language="english", calculate_probabilities=True) # We need the probabilities to visualize
topics, _ = topic_model.fit_transform(unique_contexts)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=690.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=3673.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=629.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=122.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=229.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=90895153.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=53.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466081.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=516.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=190.0, style=ProgressStyle(description_…






In [None]:
# Get the most frequent topics
topic_freq = topic_model.get_topic_freq()
outliers = topic_freq['Count'][topic_freq['Topic']==-1].iloc[0]
print(f"{outliers} documents have not been classified")
print(f"The other {topic_freq['Count'].sum() - outliers} documents are {topic_freq['Topic'].shape[0]-1} topics")

4026 documents have not been classified
The other 15003 documents are 390 topics


In [None]:
# First rows of the topic frequency table (columns 'Topic' and 'Count').
topic_freq.head()

Unnamed: 0,Topic,Count
0,-1,4026
1,0,422
2,1,192
3,2,170
4,3,163


In [None]:
# Report the second row of topic_freq.
# NOTE(review): assumes row 0 is the outlier topic -1, as in the displayed table, so that row 1 is the largest real topic — confirm.
print(f"There are {topic_freq['Count'].iloc[1]} documents that are talking about topic ID {topic_freq['Topic'].iloc[1]}")

There are 422 documents that are talking about topic ID 0


In [None]:
# Print the (word, weight) pairs returned by get_topic for the first five rows of topic_freq.
# NOTE(review): the first row is the outlier topic -1 in the displayed table, so only four real topics are shown.
for i in range(5):
  topic_no = topic_freq['Topic'].iloc[i]
  print("Topic ", topic_no," : \n")
  print(topic_model.get_topic(topic_no))
  print("\n\n")

Topic  -1  : 

[('european', 0.0014052755443250042), ('french', 0.0013051788583538224), ('british', 0.001297740908764962), ('war', 0.0012816454739306303), ('greek', 0.0012407437760033507), ('political', 0.0012099728431926174), ('german', 0.0011777086566330643), ('school', 0.001145641827534731), ('roman', 0.0011085000570878584), ('france', 0.0011047741539948984)]



Topic  0  : 

[('buddhism', 0.011693002014968857), ('buddhist', 0.009116728429284226), ('buddha', 0.007599084162995761), ('hindu', 0.0053780221246273125), ('vedic', 0.004656653565639313), ('rajasthan', 0.004191620617379647), ('mughal', 0.00415841296480315), ('subcontinent', 0.0036041931446676497), ('sutras', 0.0033493601987609855), ('buddhas', 0.003160799238495442)]



Topic  1  : 

[('jews', 0.03393742873240306), ('jewish', 0.030212688744218015), ('israel', 0.020650374270478118), ('judaism', 0.012195218462972677), ('israeli', 0.008794905599843474), ('torah', 0.005946996788837914), ('jerusalem', 0.005616828656750831), ('isra