#### Import Libraries

In [13]:
import pandas as pd
import numpy as np
import pickle
import spacy
from sklearn.metrics.pairwise import cosine_similarity

#### Import Data

In [14]:
text = "Brian is a doctor. He looks after sick people. He usually gets up at 6:00 o’clock. Today he is late, it is 6:30 and he is still in bed. He usually goes to work by train but today he is driving to work. He arrives at work at 6:30 every morning but it is 7:30 now and he is still driving. It’s 12:00 o’clock now. He always has his lunch at 12:00 but today he isn’t having lunch at 12:00, he is looking after his sick patients. It is half past seven now, Brian is watching TV. He usually watches TV at half past seven because his favorite programme starts at half past seven. Brian has his dinner at 8.30 everyday and he is having dinner now. It is 12:00 now Brian is going to bed. He always goes to bed at 12:00."

In [15]:
text

'Brian is a doctor. He looks after sick people. He usually gets up at 6:00 o’clock. Today he is late, it is 6:30 and he is still in bed. He usually goes to work by train but today he is driving to work. He arrives at work at 6:30 every morning but it is 7:30 now and he is still driving. It’s 12:00 o’clock now. He always has his lunch at 12:00 but today he isn’t having lunch at 12:00, he is looking after his sick patients. It is half past seven now, Brian is watching TV. He usually watches TV at half past seven because his favorite programme starts at half past seven. Brian has his dinner at 8.30 everyday and he is having dinner now. It is 12:00 now Brian is going to bed. He always goes to bed at 12:00.'

#### Load Reference Dictionary

In [16]:
with open('dict_cat0.pkl', 'rb') as file:  
    dict_cat0 = pickle.load(file)
    
dict_cat0_inv = {v: k for k, v in dict_cat0.items()}

In [17]:
with open('dict_cat1.pkl', 'rb') as file:  
    dict_cat1 = pickle.load(file)
    
dict_cat1_inv = {v: k for k, v in dict_cat1.items()}

#### Load Model

In [18]:
with open('model1.pkl', 'rb') as file:  
    model1 = pickle.load(file)

In [19]:
with open('model2.pkl', 'rb') as file:  
    model2 = pickle.load(file)

In [20]:
with open('tfidf.pickle', 'rb') as file:  
    vect = pickle.load(file)

In [21]:
qn = ['What does Brian do?',
      'What time does he usually get up?',
      'How does he usually go to work?',
      'Why is he driving to work today?',
      'What time does he arrive at work everyday?',
      'When does he always have his lunch?',
      'What is he doing at 12.00 today?',
      'Why does he usually watch TV at 7.30?',
      'What time is he going to bed now? ']

#### Qn Classification

In [22]:
qn_vector = vect.transform(qn)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [23]:
def predict_cat0(qn_vector):

    pred_model1, max_prob = [],[]
    prob = model1.predict_proba(qn_vector)

    for i in range(len(prob)):
        max_prob.append(prob[i].max())
        if prob[i].max() > 0.2:
            pred_model1.append(dict_cat0_inv[prob[i].argmax()])
        else:
            pred_model1.append(np.nan)
        
    return pred_model1

In [24]:
def predict_cat1(qn_vector):

    pred_model2, max_prob = [],[]
    prob = model2.predict_proba(qn_vector)

    for i in range(len(prob)):
        max_prob.append(prob[i].max())
        if prob[i].max() > 0.2:
            pred_model2.append(dict_cat1_inv[prob[i].argmax()])
        else:
            pred_model2.append(np.nan)
    
    return pred_model2

In [25]:
qn_classify = pd.DataFrame(index = qn, data = {'Category1' : predict_cat0(qn_vector)}) #  'Category2' : pred_model2 })
qn_classify.head()

Unnamed: 0,Category1
What does Brian do?,DESCRIPTION
What time does he usually get up?,NUMERIC
How does he usually go to work?,DESCRIPTION
Why is he driving to work today?,DESCRIPTION
What time does he arrive at work everyday?,NUMERIC


In [26]:
qn_classify_final = qn_classify.dropna()
qn_classify_final.index = [x.split('?')[0] for x in list(qn_classify_final.index)] ### Remove '?' from index
qn_classify_final.head()

Unnamed: 0,Category1
What does Brian do,DESCRIPTION
What time does he usually get up,NUMERIC
How does he usually go to work,DESCRIPTION
Why is he driving to work today,DESCRIPTION
What time does he arrive at work everyday,NUMERIC


#### Sentence Similiarity

In [27]:
nlp = spacy.load('en_core_web_lg')

In [28]:
doc = nlp(text)

In [29]:
text_df = pd.DataFrame([(X.text, X.label_) for X in doc.ents])
text_df.columns = ['Text', 'Category']
text_df.head()

Unnamed: 0,Text,Category
0,Brian,PERSON
1,6:00 o’clock,TIME
2,Today,DATE
3,6:30,TIME
4,today,DATE


#### Finding Solutions

In [30]:
def classify_qn(qn): 
    
    if type(qn) != list:
        qn = [qn]
    
    qn_vector = vect.transform(qn)
    
    if predict_cat0(qn_vector)[0] == 'DESCRIPTION':
        return 'sent'                                  ### Sentence

    elif predict_cat0(qn_vector)[0] == 'LOCATION':
        return 'location'                              ### Location

    elif predict_cat0(qn_vector)[0] == 'NUMERIC':
        return 'number'                                ### Cardinal, Date, Quantity, Time, Percent

    elif predict_cat0(qn_vector)[0] == 'ENTITY':       
        return 'entity'                                ### Sentence, Money 

    elif predict_cat0(qn_vector)[0] == 'HUMAN': 
        return 'human'                                 ### Sentence

    elif predict_cat0(qn_vector)[0] == 'ABBREVIATION': 
        return 'abb'                                   ### Sentence

    else:
        return 'no answer'

#### Train Test Split

In [31]:
qn1 = 'What time does he arrive at work everyday'

In [32]:
def ans(text, qn):
    
    similiar_sent,ent_list = [],[]
    
    sentence_list = text.split('.')[:-1]
    
    for sent in sentence_list:
        similiar_sent.append(nlp(qn).similarity(nlp(sent)))
        
#     for vals in sentence_list:
#         ent_list.append((X.text, X.label_) for X in nlp(vals).ents)
        
    similiar_index = sorted(range(len(similiar_sent)), key=lambda i: similiar_sent[i])[-3:]
    
    pred_class = classify_qn(qn)
    print(pred_class)
    
    if similiar_sent[similiar_index[-1]] > 0.85 :
    
        if pred_class == 'sent':

            if (similiar_sent[similiar_index[-1]] - similiar_sent[similiar_index[-3]]) < 0.0025:
                return sentence_list[similiar_index[-1]] + sentence_list[similiar_index[-2]] + sentence_list[similiar_index[-3]]

            elif (similiar_sent[similiar_index[-1]] - similiar_sent[similiar_index[-2]]) < 0.0025:
                return sentence_list[similiar_index[-1]] + sentence_list[similiar_index[-2]]

            else:
                return sentence_list[similiar_index[-1]]


        elif pred_class == 'location':

            tup_list = [(X.text, X.label_) for X in nlp(sentence_list[similiar_index[-1]] + sentence_list[similiar_index[-2]]).ents]
            d = list(dict(tup_list).values())
            if d.count('GPE') == 1:
                for vals in tup:
                    if 'GPE' in vals:
                        return vals[0]

            elif (similiar_sent[similiar_index[-1]] - similiar_sent[similiar_index[-3]]) < 0.0025:
                return sentence_list[similiar_index[-1]] + sentence_list[similiar_index[-2]] + sentence_list[similiar_index[-3]]

            elif (similiar_sent[similiar_index[-1]] - similiar_sent[similiar_index[-2]]) < 0.0025:
                return sentence_list[similiar_index[-1]] + sentence_list[similiar_index[-2]]

            else:
                return sentence_list[similiar_index[-1]]

        elif pred_class == 'number':

            tup_list = [(X.text, X.label_) for X in nlp(sentence_list[similiar_index[-1]] + sentence_list[similiar_index[-2]]).ents]
            d = list(dict(tup_list).values())
            if (d.count('GPE') == 1) or (:
                for vals in tup:
                    if 'GPE' in vals:
                        return vals[0]

            elif (similiar_sent[similiar_index[-1]] - similiar_sent[similiar_index[-3]]) < 0.0025:
                return sentence_list[similiar_index[-1]] + sentence_list[similiar_index[-2]] + sentence_list[similiar_index[-3]]

            elif (similiar_sent[similiar_index[-1]] - similiar_sent[similiar_index[-2]]) < 0.0025:
                return sentence_list[similiar_index[-1]] + sentence_list[similiar_index[-2]]

            else:
                return sentence_list[similiar_index[-1]]
            if (similiar_sent[similiar_index[-1]] - similiar_sent[similiar_index[-3]]) < 0.0025:
                return sentence_list[similiar_index[-1]] + sentence_list[similiar_index[-2]] + sentence_list[similiar_index[-3]]

            elif (similiar_sent[similiar_index[-1]] - similiar_sent[similiar_index[-2]]) < 0.0025:
                return sentence_list[similiar_index[-1]] + sentence_list[similiar_index[-2]]

            else:
                return sentence_list[similiar_index[-1]]

        elif pred_class == 'entity':

            if (similiar_sent[similiar_index[-1]] - similiar_sent[similiar_index[-3]]) < 0.0025:
                return sentence_list[similiar_index[-1]] + sentence_list[similiar_index[-2]] + sentence_list[similiar_index[-3]]

            elif (similiar_sent[similiar_index[-1]] - similiar_sent[similiar_index[-2]]) < 0.0025:
                return sentence_list[similiar_index[-1]] + sentence_list[similiar_index[-2]]

            else:
                return sentence_list[similiar_index[-1]]

        elif pred_class == 'human':

            if (similiar_sent[similiar_index[-1]] - similiar_sent[similiar_index[-3]]) < 0.0025:
                return sentence_list[similiar_index[-1]] + sentence_list[similiar_index[-2]] + sentence_list[similiar_index[-3]]

            elif (similiar_sent[similiar_index[-1]] - similiar_sent[similiar_index[-2]]) < 0.0025:
                return sentence_list[similiar_index[-1]] + sentence_list[similiar_index[-2]]

            else:
                return sentence_list[similiar_index[-1]]

        elif pred_class == 'abb':

            if (similiar_sent[similiar_index[-1]] - similiar_sent[similiar_index[-3]]) < 0.0025:
                return sentence_list[similiar_index[-1]] + sentence_list[similiar_index[-2]] + sentence_list[similiar_index[-3]]

            elif (similiar_sent[similiar_index[-1]] - similiar_sent[similiar_index[-2]]) < 0.0025:
                return sentence_list[similiar_index[-1]] + sentence_list[similiar_index[-2]]

            else:
                return sentence_list[similiar_index[-1]]

        else:
            return "Can't find any answer"
        
    else:
        return "Can't find any answer"
    

In [33]:
text = input('Enter Text: \n')

Enter Text: 
'Brian is a doctor. He looks after sick people. He usually gets up at 6:00 o’clock. Today he is late, it is 6:30 and he is still in bed. He usually goes to work by train but today he is driving to work. He arrives at work at 6:30 every morning but it is 7:30 now and he is still driving. It’s 12:00 o’clock now. He always has his lunch at 12:00 but today he isn’t having lunch at 12:00, he is looking after his sick patients. It is half past seven now, Brian is watching TV. He usually watches TV at half past seven because his favorite programme starts at half past seven. Brian has his dinner at 8.30 everyday and he is having dinner now. It is 12:00 now Brian is going to bed. He always goes to bed at 12:00.'


In [43]:
qn = input('Enter Question: \n')

Enter Question: 
What is another main form of precipitation besides drizzle, rain, snow, sleet and hail?


In [44]:
ans(text, qn)

entity


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


"Can't find any answer"

In [36]:
['What time does he usually get up?',
'Why is he driving to work today?',
'What is he doing at 12.00 today?']

['What time does he usually get up?',
 'Why is he driving to work today?',
 'What is he doing at 12.00 today?']

In [37]:
sentence_list = text.split('.')[:-1]

In [38]:
similiar_sent = []

for sent in sentence_list:
    similiar_sent.append(nlp(qn).similarity(nlp(sent)))

In [39]:
similiar_sent

[0.7182411446355524,
 0.847077614129077,
 0.7053331549259318,
 0.8503270895241182,
 0.9351232062193044,
 0.8484975478225513,
 0.547673488187769,
 0.853333372162675,
 0.8272913228247624,
 0.8158085941996177,
 0.6942150920886285,
 0.8470700640919865,
 0.8442778305482955,
 0.819141879258513]

In [64]:
ent_list = []
for vals in sentence_list:
    ent_list.append([(X.text, X.label_) for X in nlp(vals).ents])

In [40]:
similiar_index = sorted(range(len(similiar_sent)), key=lambda i: similiar_sent[i])[-3:]
similiar_index

[3, 7, 4]

In [41]:
0.9320104632079628 - 0.9277281225209243

0.004282340687038477

In [42]:
qn1

'What time does he arrive at work everyday'

In [148]:
similiar_sent = []
for sent in sentence_list:
    similiar_sent.append(nlp(qn1).similarity(nlp(sent)))

#### QA Dataset

In [52]:
qa_df = pd.read_csv('QA Dataset.csv')
del(qa_df['Unnamed: 0'])
qa_df.head()

Unnamed: 0,Text,Questions,Answers
0,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,"['When did Beyonce start becoming popular?', '...","['in the late 1990s', 'singing and dancing', '..."
1,Frédéric François Chopin (/ˈʃoʊpæn/; French pr...,"[""What was Frédéric's nationalities?"", 'In wha...","['Polish and French', 'Romantic era', 'solo pi..."
2,The exact nature of relations between Tibet an...,['Who were Wang Jiawei and Nyima Gyaincain?'],['Mainland Chinese scholars']
3,The iPod is a line of portable media players a...,"['Which company produces the iPod?', 'When was...","['Apple', 'October 23, 2001', 'three', 'portab..."
4,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,"['When did Beyonce start becoming popular?', '...","['in the late 1990s', 'singing and dancing', '..."


In [None]:
for i in range(len(qa_df)):
    qn = []
    
    for vals in qa_df.loc[i,'Questions']:
        qn.append(classify_qn(vals))
    
    qa_df.loc[i,'Questions'] = qn

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [None]:
qa_df.head()