In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# SQUAD JSON TO PANDAS DATAFRAME FOR TRAIN SET

In [None]:
def squad_json_to_dataframe_train(input_file_path, record_path = ['data','paragraphs','qas','answers'],
                           verbose = 1):
    """
    Flatten a SQuAD-format JSON file into a DataFrame with one row per question.

    input_file_path: path to the squad json file.
    record_path: path to deepest level in json file default value is
    ['data','paragraphs','qas','answers']
    verbose: 0 to suppress the progress messages, default is 1

    Returns a DataFrame holding the question id (column 'index'), 'question',
    'context', the answer fields found at the deepest level (NaN for questions
    without an answer) and 'c_id', an integer code per distinct context.

    Questions and answers are aligned on the question id, so this expects at
    most one answer per question; use squad_json_to_dataframe_dev for files
    where a question can carry several answers.
    """
    if verbose:
        print("Reading the json file")
    # The context manager closes the handle; JSON text is UTF-8, so do not
    # depend on the platform's default encoding.
    with open(input_file_path, encoding="utf-8") as json_file:
        squad = json.load(json_file)
    if verbose:
        print("Processing...")
    # One frame per nesting level of the json file
    answers = pd.json_normalize(squad, record_path)
    questions = pd.json_normalize(squad, record_path[:-1])
    paragraphs = pd.json_normalize(squad, record_path[:-2])

    # Repeat each context once per question it owns, and each question id
    # once per answer it owns, so the levels line up row by row.
    questions['context'] = np.repeat(paragraphs['context'].values, paragraphs['qas'].str.len())
    answers['q_idx'] = np.repeat(questions['id'].values, questions['answers'].str.len())

    # `axis` must be passed by keyword: pandas 2.x rejects it positionally.
    main = pd.concat(
        [questions[['id', 'question', 'context']].set_index('id'), answers.set_index('q_idx')],
        axis=1,
        sort=False,
    ).reset_index()
    main['c_id'] = main['context'].factorize()[0]
    if verbose:
        print("shape of the dataframe is {}".format(main.shape))
        print("Done")
    return main

In [None]:
# SQUAD JSON TO PANDAS DATAFRAME FOR DEV SET (USED AS TEST SET); SEPARATE LOADER AS SOME QUESTIONS HAVE MULTIPLE ANSWERS

In [None]:
def squad_json_to_dataframe_dev(input_file_path, record_path = ['data','paragraphs','qas','answers'],
                           verbose = 1):
    """
    Flatten a SQuAD-format JSON file into one row per question, keeping each
    question's answers as a list so that multiple answers are preserved.

    input_file_path: path to the squad json file.
    record_path: path to deepest level in json file default value is
    ['data','paragraphs','qas','answers']; only the question level
    (record_path[:-1]) and the paragraph level (record_path[:-2]) are read.
    verbose: 0 to suppress the progress messages, default is 1

    Returns a DataFrame with the columns 'id', 'question', 'context',
    'answers' (the raw list of answer dicts) and 'c_id', an integer code per
    distinct context.
    """
    if verbose:
        print("Reading the json file")
    # The context manager closes the handle; JSON text is UTF-8, so do not
    # depend on the platform's default encoding.
    with open(input_file_path, encoding="utf-8") as json_file:
        squad = json.load(json_file)
    if verbose:
        print("processing...")
    # Only the question and paragraph levels are needed here; the answers
    # stay nested inside the 'answers' column.
    questions = pd.json_normalize(squad, record_path[:-1])
    paragraphs = pd.json_normalize(squad, record_path[:-2])

    # Repeat each context once per question it owns so the rows line up.
    questions['context'] = np.repeat(paragraphs['context'].values, paragraphs['qas'].str.len())
    main = questions[['id', 'question', 'context', 'answers']].reset_index(drop=True)
    main['c_id'] = main['context'].factorize()[0]
    if verbose:
        print("shape of the dataframe is {}".format(main.shape))
        print("Done")
    return main

In [None]:
# Load the SQuAD v2.0 training file from the mounted Drive into `train`
# (record_path is passed explicitly; it equals the function's default)
input_file_path = "/content/drive/MyDrive/AutomatedQuestionGeneration/SQUAD/train-v2.0.json"
record_path = ['data','paragraphs','qas','answers']
train = squad_json_to_dataframe_train(input_file_path=input_file_path,record_path=record_path)

Reading the json file
Processing...
shape of the dataframe is (130319, 6)
Done


In [None]:
train.head(1200)

Unnamed: 0,index,question,context,text,answer_start,c_id
0,56be85543aeaaa14008c9063,When did Beyonce start becoming popular?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,in the late 1990s,269.0,0
1,56be85543aeaaa14008c9065,What areas did Beyonce compete in when she was...,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,singing and dancing,207.0,0
2,56be85543aeaaa14008c9066,When did Beyonce leave Destiny's Child and bec...,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,2003,526.0,0
3,56bf6b0f3aeaaa14008c9601,In what city and state did Beyonce grow up?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,"Houston, Texas",166.0,0
4,56bf6b0f3aeaaa14008c9602,In which decade did Beyonce become famous?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,late 1990s,276.0,0
...,...,...,...,...,...,...
1195,56d3ac8e2ccc5a1400d82e1e,"Where was the fictionalized ""Chopin"" produced?",Possibly the first venture into fictional trea...,Milan,179.0,103
1196,56cf50b2aab44d1400b88fbd,An 1830 sonnet was written about Chopin by wha...,Chopin has figured extensively in Polish liter...,Leon Ulrich,205.0,104
1197,56cf50b2aab44d1400b88fbe,Aside from George Sands what two French author...,Chopin has figured extensively in Polish liter...,Marcel Proust and André Gide,275.0,104
1198,56cf50b2aab44d1400b88fbf,Leon Ulrich wrote about Chopin in what format?,Chopin has figured extensively in Polish liter...,sonnet,185.0,104


In [None]:
train["text"].value_counts()

three                                                          231
two                                                            206
four                                                           171
five                                                           133
six                                                             90
                                                              ... 
not reducing the taxes of the people who converted to Islam      1
the traceable amount rule                                        1
1.2 years                                                        1
factually                                                        1
7,456                                                            1
Name: text, Length: 64763, dtype: int64

In [None]:
train["text"].isna()

0         False
1         False
2         False
3         False
4         False
          ...  
130314     True
130315     True
130316     True
130317     True
130318     True
Name: text, Length: 130319, dtype: bool

In [None]:
train.iloc[4,2]

'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".'

In [None]:
# There are unanswerable questions --> (130319 - 86821)
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130319 entries, 0 to 130318
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   index         130319 non-null  object 
 1   question      130319 non-null  object 
 2   context       130319 non-null  object 
 3   text          86821 non-null   object 
 4   answer_start  86821 non-null   float64
 5   c_id          130319 non-null  int64  
dtypes: float64(1), int64(1), object(4)
memory usage: 6.0+ MB


In [None]:
# There are some repeating questions
len(train["question"].unique())

130217

In [None]:
train["text"].value_counts()

three                                                          231
two                                                            206
four                                                           171
five                                                           133
six                                                             90
                                                              ... 
not reducing the taxes of the people who converted to Islam      1
the traceable amount rule                                        1
1.2 years                                                        1
factually                                                        1
7,456                                                            1
Name: text, Length: 64763, dtype: int64

In [None]:
#Analysis of Unanswerable Questions in train set
unanswerable_train = train[train["text"].isna()]

In [None]:
len(unanswerable_train)

43498

In [None]:
unanswerable_train.head(16)

Unnamed: 0,index,question,context,text,answer_start,c_id
2075,5a8d7bf7df8bba001a0f9ab1,What category of game is Legend of Zelda: Aust...,The Legend of Zelda: Twilight Princess (Japane...,,,280
2076,5a8d7bf7df8bba001a0f9ab2,What consoles can be used to play Australia Tw...,The Legend of Zelda: Twilight Princess (Japane...,,,280
2077,5a8d7bf7df8bba001a0f9ab3,When was Australia Twilight launched in North ...,The Legend of Zelda: Twilight Princess (Japane...,,,280
2078,5a8d7bf7df8bba001a0f9ab4,When could GameCube owners purchase Australian...,The Legend of Zelda: Twilight Princess (Japane...,,,280
2079,5a8d7bf7df8bba001a0f9ab5,What year was the Legend of Zelda: Australian ...,The Legend of Zelda: Twilight Princess (Japane...,,,280
2086,5a8d800edf8bba001a0f9abb,What land does Ocarina serve to protect?,"The story focuses on series protagonist Link, ...",,,281
2087,5a8d800edf8bba001a0f9abc,What character helped Link in Ocarina Princess?,"The story focuses on series protagonist Link, ...",,,281
2088,5a8d800edf8bba001a0f9abd,Who is the protagonist of Midna of Time?,"The story focuses on series protagonist Link, ...",,,281
2089,5a8d800edf8bba001a0f9abe,From what alternate dimension does this dimens...,"The story focuses on series protagonist Link, ...",,,281
2090,5a8d800edf8bba001a0f9abf,When does the Waker take place?,"The story focuses on series protagonist Link, ...",,,281


In [None]:
unanswerable_train.iloc[2,2]

'The Legend of Zelda: Twilight Princess (Japanese: ゼルダの伝説 トワイライトプリンセス, Hepburn: Zeruda no Densetsu: Towairaito Purinsesu?) is an action-adventure game developed and published by Nintendo for the GameCube and Wii home video game consoles. It is the thirteenth installment in the The Legend of Zelda series. Originally planned for release on the GameCube in November 2005, Twilight Princess was delayed by Nintendo to allow its developers to refine the game, add more content, and port it to the Wii. The Wii version was released alongside the console in North America in November 2006, and in Japan, Europe, and Australia the following month. The GameCube version was released worldwide in December 2006.[b]'

In [None]:
# Print the first five unanswerable questions (fewer if the frame is shorter),
# selecting the column by name instead of by position
for question in unanswerable_train["question"].head(5):
  print(question)
  print()

What category of game is Legend of Zelda: Australia Twilight?

What consoles can be used to play Australia Twilight?

When was Australia Twilight launched in North America?

When could GameCube owners purchase Australian Princess?

What year was the Legend of Zelda: Australian Princess originally planned for release?



In [None]:
# Load the SQuAD v2.0 dev file (used as the test set in this notebook)
input_file_path = '/content/drive/MyDrive/AutomatedQuestionGeneration/SQUAD/dev-v2.0.json'
record_path = ['data','paragraphs','qas','answers']
verbose = 0
# `verbose` has to be passed explicitly; otherwise the loader uses its default of 1
dev = squad_json_to_dataframe_dev(input_file_path=input_file_path,record_path=record_path,verbose=verbose)

Reading the json file
processing...
shape of the dataframe is (11873, 5)
Done


In [None]:
def testset_preprocessing(dev):
  def duplicate_removal(l):
    return [dict(t) for t in {tuple(d.items()) for d in l}] 

  dev["answers"] = dev["answers"].apply(duplicate_removal)

  id = []
  question = []
  context = []
  text = []
  answer_start = []
  c_id = []

  for i in range(len(dev)):
    if not (dev["answers"][i]): 
      id.append(dev["id"][i])
      question.append(dev["question"][i])
      context.append(dev["context"][i])
      text.append(None)
      answer_start.append(None)
      c_id.append(dev["c_id"][i])
    
    for j in range(len(dev["answers"][i])):
      id.append(dev["id"][i])
      question.append(dev["question"][i])
      context.append(dev["context"][i])
      text.append(dev["answers"][i][j]["text"])
      answer_start.append(dev["answers"][i][j]["answer_start"])
      c_id.append(dev["c_id"][i])
  
  return pd.DataFrame(zip(id,question,context,text,answer_start,c_id), columns=["index","question","context","text","answer_start","c_id"]).fillna(value=np.nan)

In [None]:
dev = testset_preprocessing(dev)

In [None]:
dev.head(20)

Unnamed: 0,index,question,context,text,answer_start,c_id
0,56ddde6b9a695914005b9628,In what country is Normandy located?,The Normans (Norman: Nourmands; French: Norman...,France,159.0,0
1,56ddde6b9a695914005b9629,When were the Normans in Normandy?,The Normans (Norman: Nourmands; French: Norman...,in the 10th and 11th centuries,87.0,0
2,56ddde6b9a695914005b9629,When were the Normans in Normandy?,The Normans (Norman: Nourmands; French: Norman...,10th and 11th centuries,94.0,0
3,56ddde6b9a695914005b962a,From which countries did the Norse originate?,The Normans (Norman: Nourmands; French: Norman...,"Denmark, Iceland and Norway",256.0,0
4,56ddde6b9a695914005b962b,Who was the Norse leader?,The Normans (Norman: Nourmands; French: Norman...,Rollo,308.0,0
5,56ddde6b9a695914005b962c,What century did the Normans first gain their ...,The Normans (Norman: Nourmands; French: Norman...,10th,671.0,0
6,56ddde6b9a695914005b962c,What century did the Normans first gain their ...,The Normans (Norman: Nourmands; French: Norman...,10th century,671.0,0
7,56ddde6b9a695914005b962c,What century did the Normans first gain their ...,The Normans (Norman: Nourmands; French: Norman...,the first half of the 10th century,649.0,0
8,5ad39d53604f3c001a3fe8d1,Who gave their name to Normandy in the 1000's ...,The Normans (Norman: Nourmands; French: Norman...,,,0
9,5ad39d53604f3c001a3fe8d2,What is France a region of?,The Normans (Norman: Nourmands; French: Norman...,,,0


In [None]:
dev[dev["text"].isna()]

Unnamed: 0,index,question,context,text,answer_start,c_id
8,5ad39d53604f3c001a3fe8d1,Who gave their name to Normandy in the 1000's ...,The Normans (Norman: Nourmands; French: Norman...,,,0
9,5ad39d53604f3c001a3fe8d2,What is France a region of?,The Normans (Norman: Nourmands; French: Norman...,,,0
10,5ad39d53604f3c001a3fe8d3,Who did King Charles III swear fealty to?,The Normans (Norman: Nourmands; French: Norman...,,,0
11,5ad39d53604f3c001a3fe8d4,When did the Frankish identity emerge?,The Normans (Norman: Nourmands; French: Norman...,,,0
16,5ad3a266604f3c001a3fea27,What type of major impact did the Norman dynas...,"The Norman dynasty had a major political, cult...",,,1
...,...,...,...,...,...,...
16318,5ad28a57d7d075001a4299b3,What does not change macroscopic closed systems?,The connection between macroscopic nonconserva...,,,1202
16329,5ad28ad0d7d075001a4299cc,What does not have a metric counterpart?,"The pound-force has a metric counterpart, less...",,,1203
16330,5ad28ad0d7d075001a4299cd,What is the force exerted by standard gravity ...,"The pound-force has a metric counterpart, less...",,,1203
16331,5ad28ad0d7d075001a4299ce,What force leads to a commonly used unit of mass?,"The pound-force has a metric counterpart, less...",,,1203


In [None]:
# There are some questions that are unanswerable --> (16333 - 10388)
dev.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16333 entries, 0 to 16332
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   index         16333 non-null  object 
 1   question      16333 non-null  object 
 2   context       16333 non-null  object 
 3   text          10388 non-null  object 
 4   answer_start  10388 non-null  float64
 5   c_id          16333 non-null  int64  
dtypes: float64(1), int64(1), object(4)
memory usage: 765.7+ KB


In [None]:
# TOPIC MODELLING ON SQUAD

In [None]:
!pip3 install bertopic[all]

Collecting bertopic[all]
[?25l  Downloading https://files.pythonhosted.org/packages/f6/9e/16678af67081452c01fcaeca5fd734a1033be2da0e9d40815ee742588ef4/bertopic-0.8.1-py2.py3-none-any.whl (53kB)
[K     |████████████████████████████████| 61kB 2.0MB/s 
[?25hCollecting hdbscan>=0.8.27
[?25l  Downloading https://files.pythonhosted.org/packages/32/bb/59a75bc5ac66a9b4f9b8f979e4545af0e98bb1ca4e6ae96b3b956b554223/hdbscan-0.8.27.tar.gz (6.4MB)
[K     |████████████████████████████████| 6.4MB 7.4MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting numpy>=1.20.0
  Using cached https://files.pythonhosted.org/packages/3f/03/c3526fb4e79a793498829ca570f2f868204ad9a8040afcd72d82a8f121db/numpy-1.21.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl
Collecting umap-learn>=0.5.0
[?25l  Downloading https://files.pythonhosted.org/packages/75/69/85e7f950bb75792ad5d

In [None]:
from bertopic import BERTopic

In [None]:
unique_contexts = train["context"].unique()

In [None]:
topic_model = BERTopic(language="english", calculate_probabilities=True) # We need the probabilities to visualize
topics, _ = topic_model.fit_transform(unique_contexts)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=690.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=3673.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=629.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=122.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=229.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=90895153.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=53.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466081.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=516.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=190.0, style=ProgressStyle(description_…






In [None]:
# Get the most frequent topics
topic_freq = topic_model.get_topic_freq()
# Topic -1 collects the documents that were not assigned to any topic; that
# row may be missing, so sum over a mask instead of indexing the first match.
is_outlier = topic_freq['Topic'] == -1
outliers = topic_freq.loc[is_outlier, 'Count'].sum()
n_topics = int((~is_outlier).sum())
print(f"{outliers} documents have not been classified")
print(f"The other {topic_freq['Count'].sum() - outliers} documents are {n_topics} topics")

3886 documents have not been classified
The other 15143 documents are 377 topics


In [None]:
topic_freq.head()

Unnamed: 0,Topic,Count
0,-1,3886
1,0,403
2,1,246
3,2,174
4,3,162


In [None]:
print(f"There are {topic_freq['Count'].iloc[1]} documents that are talking about topic ID {topic_freq['Topic'].iloc[1]}")

There are 403 documents that are talking about topic ID 0


In [None]:
# Print the contents of the first ten rows of topic_freq (fewer if the model
# found fewer topics)
for topic_no in topic_freq['Topic'].head(10):
  print("Topic ", topic_no," : \n")
  print(topic_model.get_topic(topic_no))
  print("\n\n")

Topic  -1  : 

[('greek', 0.0014291144090495488), ('french', 0.0014069802815344376), ('british', 0.0014030932770233236), ('france', 0.001298056251990432), ('european', 0.0012914284565652492), ('german', 0.001247523958206216), ('government', 0.0011612986526384293), ('roman', 0.0010736120734079048), ('britain', 0.0010605645641611414), ('europe', 0.0010537588518273555)]



Topic  0  : 

[('buddhism', 0.01199060251225066), ('buddhist', 0.009512016991621406), ('buddha', 0.008104588198401846), ('hindu', 0.0057905933423091005), ('rajasthan', 0.00489524067298574), ('vedic', 0.004846018671718188), ('mughal', 0.004328062367539183), ('subcontinent', 0.0035501623450479945), ('sutras', 0.0034835230997401646), ('buddhas', 0.0032871259601667153)]



Topic  1  : 

[('presbyterian', 0.019031089435408325), ('protestant', 0.014490405360400381), ('catholic', 0.010533939572614984), ('christian', 0.007736965528970018), ('christianity', 0.0068260960114807714), ('christians', 0.006787650690161353), ('protesta

In [None]:
# Returns how spatially separated each topic is.
topic_model.visualize_topics()

In [None]:
new_topic = "Elon Musk"

In [None]:
# find_topics returns the five topics most similar to the input text, in decreasing order of similarity
topic_model.find_topics(new_topic)

([65, 356, 99, 221, 215],
 [0.30188635478062975,
  0.2765843407181786,
  0.2763852754489783,
  0.26495829646953584,
  0.24646894450946719])

In [None]:
topic_model.find_topics(new_topic)[1][0]

0.30188635478062975

In [None]:
# get_topic returns the top words of the given topic together with their scores
topic_model.get_topic(65)

[('mexican', 0.01752407521865159),
 ('hidalgo', 0.015344479765952407),
 ('mexico', 0.01413802031072275),
 ('valencia', 0.00901828650106507),
 ('governor', 0.008740798350072788),
 ('orozco', 0.006748888539163003),
 ('norte', 0.006023994993650683),
 ('durango', 0.005806360190805719),
 ('paso', 0.005549131282290155),
 ('manuel', 0.005144610057040906)]

In [None]:
# Heatmap on the correlation between each of the clustered topics
# No plt.figure() here: the matplotlib canvas stayed empty ("with 0 Axes") and
# only wasted memory; the heatmap is displayed from the call's return value.
topic_model.visualize_heatmap()

<Figure size 14400x14400 with 0 Axes>

In [None]:
# Agglomerative Clustering on the topics clustered based on unique contexts
topic_model.visualize_hierarchy()

In [None]:
topic_model.get_params()

{'calculate_probabilities': True,
 'embedding_model': <bertopic.backend._sentencetransformers.SentenceTransformerBackend at 0x7f66921be890>,
 'hdbscan_model': HDBSCAN(algorithm='best', allow_single_cluster=False, alpha=1.0,
         approx_min_span_tree=True, cluster_selection_epsilon=0.0,
         cluster_selection_method='eom', core_dist_n_jobs=4,
         gen_min_span_tree=False, leaf_size=40,
         match_reference_implementation=False, memory=Memory(location=None),
         metric='euclidean', min_cluster_size=10, min_samples=None, p=None,
         prediction_data=True),
 'language': 'english',
 'low_memory': False,
 'min_topic_size': 10,
 'n_gram_range': (1, 1),
 'nr_topics': None,
 'top_n_words': 10,
 'umap_model': UMAP(a=None, angular_rp_forest=True, b=None, dens_frac=0.0, dens_lambda=0.0,
      dens_var_shift=0.1, densmap=False, disconnection_distance=None,
      force_approximation_algorithm=False, init='spectral', learning_rate=1.0,
      local_connectivity=1.0, low_memory

In [None]:
topic_model.save("./topic_model")

In [None]:
# load() returns the restored model, so the result has to be bound to a name.
# NOTE(review): the save cell writes to "./topic_model"; confirm the file was
# copied to this Drive path.
topic_model = BERTopic.load("/content/drive/MyDrive/AutomatedQuestionGeneration/topic_model")

In [None]:
# TOPIC MODEL FILTERIZER

In [None]:
train.head(10)

Unnamed: 0,index,question,context,text,answer_start,c_id
0,56be85543aeaaa14008c9063,When did Beyonce start becoming popular?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,in the late 1990s,269.0,0
1,56be85543aeaaa14008c9065,What areas did Beyonce compete in when she was...,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,singing and dancing,207.0,0
2,56be85543aeaaa14008c9066,When did Beyonce leave Destiny's Child and bec...,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,2003,526.0,0
3,56bf6b0f3aeaaa14008c9601,In what city and state did Beyonce grow up?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,"Houston, Texas",166.0,0
4,56bf6b0f3aeaaa14008c9602,In which decade did Beyonce become famous?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,late 1990s,276.0,0
5,56bf6b0f3aeaaa14008c9603,In what R&B group was she the lead singer?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,Destiny's Child,320.0,0
6,56bf6b0f3aeaaa14008c9604,What album made her a worldwide known artist?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,Dangerously in Love,505.0,0
7,56bf6b0f3aeaaa14008c9605,Who managed the Destiny's Child group?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,Mathew Knowles,360.0,0
8,56d43c5f2ccc5a1400d830a9,When did Beyoncé rise to fame?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,late 1990s,276.0,0
9,56d43c5f2ccc5a1400d830aa,What role did Beyoncé have in Destiny's Child?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,lead singer,290.0,0


In [None]:
train.iloc[0,2]

'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".'

In [None]:
def topic_filterizer(question, threshold=0.5):
  """
  Tell whether `question` is close enough to any topic of `topic_model`.

  question: text to look up with topic_model.find_topics.
  threshold: minimum similarity the best-matching topic must reach
  (default 0.5).

  Returns True when the highest similarity is >= threshold, else False.
  """
  # find_topics returns (topic ids, similarities) ordered from most to least
  # similar, so element [1][0] is the best score.
  best_similarity = topic_model.find_topics(question)[1][0]
  return bool(best_similarity >= threshold)

# topic_filterizer is applied to every question in `train`; .head() then
# keeps only the first five rows that passed the filter.
new_train = train[train["question"].apply(topic_filterizer) == True].head()

In [None]:
train[train["question"].apply(topic_filterizer) == True]

In [None]:
# Count the unanswerable questions whose best topic similarity reaches 0.3 and
# print the position of every question that does not reach it
c = 0
for i, question in enumerate(unanswerable_train.iloc[:, 1]):
  best_similarity = topic_model.find_topics(question)[1][0]
  if best_similarity >= 0.3:
    c += 1
    continue
  print(i)
print("Count: ", c)

15
60
68
69
80
125
179
236
283
311
319
419
439
448
452
455
582
659
865
912
917
921
923
933
936
938
940
951
972
986
1007
1008
1219
1248
1265
1374
1378
1379
1406
1430
1436
1456
1474
1557
1746
1877
2062
2063
2069
2109
2131
2133
2141
2145
2147
2361
2399
2436
2561
2604
2638
2649
2666
2682
2683
2899
2910
2956
3152
3153
3220
3232
3353
3683
3873
3913
3932
3934
3943
3945
3961
3962
3963
3970
3971
3977
3984
4005
4015
4019
4053
4056
4115
4233
4239
4313
4314
4425
4456
4465
4477
4552
4699
4702
4770
4773
4799
4837
4999
5027
5067
5086
5195
5216
5346
5353
5408
5542
5610
5682
5683
5684
5847
5888
5928
5953
5989
6165
6187
6634
6643
6644
6645
6649
6723
6726
6727
6763
6767
6768
6769
6776
6777
6831
6880
6945
7584
7616
7618
7666
7726
7734
7736
7825
7827
7845
7862
7866
7883
7918
7987
8019
8035
8083
8084
8103
8122
8147
8234
8249
8265
8272
8273
8304
8307
8350
8469
8476
8547
8943
9151
9200
9201
9211
9213
9221
9276
9329
9498
9760
9767
9786
9788
9806
9819
9821
9839
9844
9865
9875
10029
10435
10549
10564
10574
10600

In [None]:
# Checking if a question returned as not related to any context (since similarity < threshold) is actually an ambiguous question

In [None]:
# Similarity of the topic most similar to that question
topic_model.find_topics(unanswerable_train.iloc[15,1])[1][0]

0.2981184876868368

In [None]:
# Question under analysis
unanswerable_train.iloc[15,1]

'What are the three main activities in The Legend of Zelda: Clawshot Princess?'

In [None]:
# Context for that particular question
unanswerable_train.iloc[15,2]

"The Legend of Zelda: Twilight Princess is an action-adventure game focused on combat, exploration, and item collection. It uses the basic control scheme introduced in Ocarina of Time, including context-sensitive action buttons and L-targeting (Z-targeting on the Wii), a system that allows the player to keep Link's view focused on an enemy or important object while moving and attacking. Link can walk, run, and attack, and will automatically jump when running off of or reaching for a ledge.[c] Link uses a sword and shield in combat, complemented with secondary weapons and items, including a bow and arrows, a boomerang, bombs, and the Clawshot (similar to the Hookshot introduced earlier in the The Legend of Zelda series).[d] While L-targeting, projectile-based weapons can be fired at a target without the need for manual aiming.[c]"

In [None]:
# Answer to that question (row 15, the same row inspected in the cells above)
unanswerable_train.iloc[15,3]

nan