In [76]:
import json
import numpy as np


# Load the raw question/answer intents.
# The encoding is pinned to UTF-8 because the data contains non-ASCII
# characters (curly quotes, en dashes); without it, decoding would depend on
# the platform's locale encoding.
with open("../../data/all_qa.json", encoding="utf-8") as file:
    intents = json.load(file)

## Preparing the data

In [89]:
# Three parallel lists, one entry per intent:
#   combined_list  -> "<document> <patterns>-<responses>" (used to spot exact duplicates)
#   questions_list -> the intent's patterns joined by newlines
#   responses_list -> the intent's responses joined by newlines
combined_list = []
questions_list = []
responses_list = []

for intent in intents:
    # Join outside the f-string: a backslash inside an f-string replacement
    # field is a SyntaxError on Python versions before 3.12.
    questions = '\n'.join(intent['patterns'])
    responses = '\n'.join(intent['responses'])

    combined_list.append(f"{intent['document']} {questions}-{responses}")
    questions_list.append(questions)
    responses_list.append(responses)

In [90]:
# One line per collection: total number of entries, then number of distinct entries.
print(
    *(
        f"{len(values)} {len(set(values))}"
        for values in (combined_list, questions_list, responses_list)
    ),
    sep="\n",
)

533 533
533 533
533 533


In [79]:
# Convert the three collections to NumPy arrays so that element-wise
# comparison (`array == value`) and `np.delete` can be used on them.
combined_list, questions_list, responses_list = (
    np.array(values)
    for values in (combined_list, questions_list, responses_list)
)

## Getting duplicated qa

We slice the result of `np.where` with `[1:]` because we want to keep the first occurrence of each duplicate and delete only the later ones.

In [80]:
from collections import Counter

# (combined string, count) pairs for every combined entry that occurs more than once.
queries_with_same_qa = list(filter(lambda x: x[1] > 1, Counter(combined_list).most_common()))
print(queries_with_same_qa)

# Default to "nothing to delete" so that later cells using `qa_index_to_delete`
# still run (as a no-op) when the data contains no exact duplicates.
qa_index_to_delete = np.array([], dtype=int)

if queries_with_same_qa:
    # For each duplicated entry collect every position where it occurs and drop
    # the first one ([1:]) so that a single copy is kept.
    qa_index_to_delete = [np.where(combined_list == qa[0])[0][1:] for qa in queries_with_same_qa]
    qa_index_to_delete = np.concatenate(qa_index_to_delete)
    print(qa_index_to_delete)

[('Book 500 Data Science Interview Questions by Vamsee Puligadda What is logistic regression? Or State an example when you have used logistic regression recently.-Logistic Regression often referred as logit model is a technique to predict the binary outcome from a linear combination of predictor variables. For example, if you want to predict whether a particular political leader will win the election or not. In this case, the outcome of prediction is binary i.e. 0 or 1 (Win/Lose). The predictor variables here would be the amount of money spent for election campaigning of a particular candidate, the amount of time spent in campaigning, etc.', 2), ('Book 500 Data Science Interview Questions by Vamsee Puligadda What are Recommender Systems?-A subclass of information filtering systems that are meant to predict the preferences or ratings that a user would give to a product. Recommender systems are widely used in movies, news, research articles, products, social tags, music, etc.', 2)]
[401 

In [81]:
# Remove the duplicated positions from both parallel arrays.
questions_list, responses_list = (
    np.delete(values, qa_index_to_delete)
    for values in (questions_list, responses_list)
)

# Sort in place, then walk the positions from highest to lowest so that each
# pop leaves the positions still waiting to be removed unchanged.
qa_index_to_delete.sort()

for ind_index in reversed(qa_index_to_delete):
    intents.pop(ind_index)

## Merging identical questions that have different answers

In [82]:
# (question, count) pairs for every question text that occurs more than once.
queries_with_same_q = list(filter(lambda x: x[1] > 1, Counter(questions_list).most_common()))
print(queries_with_same_q)

# Always defined, so code that deletes by these positions also works when
# there is nothing to merge.
q_index_to_delete = []

if queries_with_same_q:
    # Element-wise comparison needs an ndarray; convert rather than silently
    # skipping the merge when `questions_list` is a plain list.
    questions_array = np.asarray(questions_list)
    same_q_index = [np.where(questions_array == q[0])[0] for q in queries_with_same_q]

    for same_q_index_batch in same_q_index:
        # The first occurrence is kept and receives the responses of the others.
        kept_responses = intents[same_q_index_batch[0]]['responses']
        for index in same_q_index_batch[1:]:
            for response in intents[index]['responses']:
                # Skip responses that are already present: this avoids duplicated
                # answers and makes re-running the cell harmless.
                if response not in kept_responses:
                    kept_responses.append(response)
        q_index_to_delete.extend(same_q_index_batch[1:])
    print(q_index_to_delete)

[('What do you understand by the term Normal Distribution?', 3), ('What is the difference between supervised and unsupervised machine learning?', 2), ('What Is Linear Regression?', 2), ('Python or R – Which one would you prefer for text analytics?', 2), ('What are Recommender Systems?', 2), ('During analysis, how do you treat missing values?', 2), ('Can you cite some examples where both false positive and false negatives are equally important?', 2), ('How will you define the number of clusters in a clustering algorithm?', 2), ('A certain couple tells you that they have two children, at least one of which is a girl. What is the probability that they have two girls?', 2)]
[401, 419, 386, 291, 423, 439, 444, 431, 445, 500]


In [83]:
# Prune both parallel arrays so they stay aligned with `intents`; leaving
# `questions_list` untouched would make its positions point at the wrong
# intents from here on.
questions_list = np.delete(questions_list, q_index_to_delete)
responses_list = np.delete(responses_list, q_index_to_delete)

q_index_to_delete.sort()

# Pop from the highest position down so earlier pops do not shift the positions
# still pending; every removed intent is printed for inspection.
for ind_index in q_index_to_delete[::-1]:
    print(intents.pop(ind_index))

{'question_n': '427', 'patterns': ['A certain couple tells you that they have two children, at least one of which is a girl. What is the probability that they have two girls?'], 'responses': ['1/3'], 'document': 'Book 500 Data Science Interview Questions by Vamsee Puligadda', 'id': '2c4dc19bf4ee74e6545b7012af9c5925'}
{'question_n': '231', 'patterns': ['How will you define the number of clusters in a clustering algorithm?'], 'responses': ['Though the Clustering Algorithm is not specified, this question is mostly in reference to K-Means clustering where “K” defines the number of clusters. The objective of clustering is to group similar entities in a way that the entities within a group are similar to each other but the groups are different from each other.  For example, the following image shows three different groups.  ![](../../data/books/500 Data Science Interview Questions/500-Data-Science-Interview-Questions.pdf-77-0.png)  Within Sum of squares is generally used to explain the homog

## Analyzing duplicated answers with different questions

In [84]:
# (response, count) pairs for every response text shared by more than one entry.
queries_with_same_a = [
    (answer, count)
    for answer, count in Counter(responses_list).most_common()
    if count > 1
]
print(queries_with_same_a)

if queries_with_same_a:
    # Positions of every entry whose response text is shared with another entry.
    same_a_index = np.concatenate(
        [np.where(responses_list == a[0])[0] for a in queries_with_same_a]
    )
    print(same_a_index)

[('Logistic Regression often referred as logit model is a technique to predict the binary outcome from a linear combination of predictor variables. For example, if you want to predict whether a particular political leader will win the election or not. In this case, the outcome of prediction is binary i.e. 0 or 1 (Win/Lose). The predictor variables here would be the amount of money spent for election campaigning of a particular candidate, the amount of time spent in campaigning, etc.', 2)]
[297 432]


Since there is only one duplicate, I will analyze it by hand.

In [85]:
# Show the first of the two entries reported as sharing the same response text.
# NOTE: 297 is a position in the current `intents` list, so it is only valid
# for the kernel state that produced the duplicate-answer output.
intents[297]

{'question_n': '34',
 'patterns': ['What is logistic regression? Or State an example when you have used logistic regression recently.'],
 'responses': ['Logistic Regression often referred as logit model is a technique to predict the binary outcome from a linear combination of predictor variables. For example, if you want to predict whether a particular political leader will win the election or not. In this case, the outcome of prediction is binary i.e. 0 or 1 (Win/Lose). The predictor variables here would be the amount of money spent for election campaigning of a particular candidate, the amount of time spent in campaigning, etc.'],
 'document': 'Book 500 Data Science Interview Questions by Vamsee Puligadda',
 'id': '46eca99cf9df31d2ef2d5d9d8911c0ea'}

In [86]:
# Show the second of the two entries reported as sharing the same response text.
# NOTE: 432 is a position in the current `intents` list, so it is only valid
# for the kernel state that produced the duplicate-answer output.
intents[432]

{'question_n': '224',
 'patterns': ['What is logistic regression? State an example when you have used logistic regression recently.'],
 'responses': ['Logistic Regression often referred as logit model is a technique to predict the binary outcome from a linear combination of predictor variables. For example, if you want to predict whether a particular political leader will win the election or not. In this case, the outcome of prediction is binary i.e. 0 or 1 (Win/Lose). The predictor variables here would be the amount of money spent for election campaigning of a particular candidate, the amount of time spent in campaigning, etc.'],
 'document': 'Book 500 Data Science Interview Questions by Vamsee Puligadda',
 'id': 'ffa93feaa7f0cc9cf6128c6b4bec50f9'}

It's the same question, except that the first one has an extra "Or" right after the `?`. I will keep only the second one.

In [87]:
# Remove the "Or State an example ..." variant of the logistic-regression
# question. It is located by its stable `id` instead of a hard-coded position:
# positions shift whenever `intents` changes, so re-running a positional pop
# would silently delete an unrelated entry.
duplicate_id = '46eca99cf9df31d2ef2d5d9d8911c0ea'
duplicate_pos = next(
    (pos for pos, intent in enumerate(intents) if intent.get('id') == duplicate_id),
    None,
)

# Displays the removed intent, or nothing when it has already been removed.
intents.pop(duplicate_pos) if duplicate_pos is not None else None

{'question_n': '34',
 'patterns': ['What is logistic regression? Or State an example when you have used logistic regression recently.'],
 'responses': ['Logistic Regression often referred as logit model is a technique to predict the binary outcome from a linear combination of predictor variables. For example, if you want to predict whether a particular political leader will win the election or not. In this case, the outcome of prediction is binary i.e. 0 or 1 (Win/Lose). The predictor variables here would be the amount of money spent for election campaigning of a particular candidate, the amount of time spent in campaigning, etc.'],
 'document': 'Book 500 Data Science Interview Questions by Vamsee Puligadda',
 'id': '46eca99cf9df31d2ef2d5d9d8911c0ea'}

## Saving the new all_qa_cleaned

In [88]:
# Write the de-duplicated intents to a separate file, leaving the raw input untouched.
output_path = "../../data/all_qa_cleaned.json"
with open(output_path, "w") as file:
    json.dump(intents, fp=file)