## Imports

In [37]:
import pandas as pd
import os
import json
import re
import spacy
import nmslib

In [34]:
# Install nmslib, which provides the nearest-neighbour index used by the QA class.
# NOTE(review): this cell appears after the cell that runs `import nmslib`; on a
# fresh kernel the install has to happen before that import can succeed.
!pip install nmslib

Collecting nmslib
[?25l  Downloading https://files.pythonhosted.org/packages/e1/95/1f7c90d682b79398c5ee3f9296be8d2640fa41de24226bcf5473c801ada6/nmslib-1.7.3.6.tar.gz (255kB)
[K    100% |████████████████████████████████| 256kB 11.8MB/s ta 0:00:01
[?25hCollecting pybind11>=2.0 (from nmslib)
[?25l  Downloading https://files.pythonhosted.org/packages/f2/7c/e71995e59e108799800cb0fce6c4b4927914d7eada0723dd20bae3b51786/pybind11-2.2.4-py2.py3-none-any.whl (145kB)
[K    100% |████████████████████████████████| 153kB 42.3MB/s ta 0:00:01
Building wheels for collected packages: nmslib
  Running setup.py bdist_wheel for nmslib ... [?25lerror
  Complete output from command /home/onepanel/.conda/bin/python -u -c "import setuptools, tokenize;__file__='/tmp/pip-install-aizvpy22/nmslib/setup.py';f=getattr(tokenize, 'open', open)(__file__);code=f.read().replace('\r\n', '\n');f.close();exec(compile(code, __file__, 'exec'))" bdist_wheel -d /tmp/pip-wheel-pq7px92p --python-tag cp36:
  running bdist_whe

In [22]:
# Download the spaCy English model and link it as 'en' so spacy.load('en') works.
!python -m spacy download en

Collecting en_core_web_sm==2.0.0 from https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz#egg=en_core_web_sm==2.0.0
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz (37.4MB)
[K    100% |████████████████████████████████| 37.4MB 87.5MB/s ta 0:00:01
[?25hInstalling collected packages: en-core-web-sm
  Running setup.py install for en-core-web-sm ... [?25ldone
[?25hSuccessfully installed en-core-web-sm-2.0.0
[33mYou are using pip version 10.0.1, however version 19.0.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m

[93m    Linking successful[0m
    /home/onepanel/.conda/lib/python3.6/site-packages/en_core_web_sm -->
    /home/onepanel/.conda/lib/python3.6/site-packages/spacy/data/en

    You can now load the model via spacy.load('en')



In [2]:
# Install xlrd — presumably the Excel reader backend pd.read_excel relies on
# for the .xlsx file loaded below; TODO confirm for the pandas version in use.
!pip install xlrd

[31mkaggle 1.5.0 has requirement urllib3<1.23.0,>=1.15, but you'll have urllib3 1.24.1 which is incompatible.[0m
[31mfastai 1.0.18 has requirement notebook>=5.7.0, but you'll have notebook 5.6.0 which is incompatible.[0m
[31mboto3 1.9.125 has requirement botocore<1.13.0,>=1.12.125, but you'll have botocore 1.12.93 which is incompatible.[0m
[33mYou are using pip version 10.0.1, however version 19.0.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


# Read xlsx file sheet

In [16]:
# Read the 'Sheet1' worksheet: no header row, first spreadsheet column used as the index.
data = pd.read_excel(
    '../data/QA_(0-100).xlsx',
    sheet_name='Sheet1',
    header=None,
    index_col=0,
)

In [17]:
# Keep only the first two columns and replace the index with a fresh 0..n-1 range.
data = data.iloc[:, :2].reset_index(drop=True)

In [19]:
# Label the two remaining columns; the QA class reads them as data.question / data.answer.
# Assigning to .columns relabels the existing frame in place.
data.columns = ['question', 'answer']

In [23]:
# Preview the first two question/answer rows.
data.head(2)

Unnamed: 0,question,answer
0,Carl and the Passions changed band name to what,Beach Boys
1,How many rings on the Olympic flag,Five


# Create QA model class

In [70]:
class QA(object):
    """Nearest-neighbour question answering over a fixed list of QA pairs.

    Every stored question is embedded with a spaCy pipeline and added to an
    nmslib HNSW index that uses cosine distance. A query is embedded the same
    way and answered with the answer of the closest stored question, provided
    that question is within a distance threshold.
    """

    def __init__(self, data):
        """Load the spaCy model and keep questions/answers as parallel lists.

        Args:
            data: object exposing ``question`` and ``answer`` attributes that
                support ``.tolist()`` (e.g. a pandas DataFrame with those
                two columns).
        """
        self.nlp = spacy.load('en')
        self.questions = data.question.tolist()
        self.answers = data.answer.tolist()
        # Populated by build_nmslib_index(); search() builds it on first use
        # if the caller has not done so explicitly.
        self.index = None

    def to_vectors(self, texts):
        """Convert texts into their vectors.

        Args:
            texts: iterable of strings.

        Returns:
            list with one ``self.nlp(text).vector`` per input text, in order.
        """
        return [self.nlp(text).vector for text in texts]

    def build_nmslib_index(self):
        """Build the nmslib index with vectors of the question texts.

        Position ``i`` in the index corresponds to ``self.questions[i]`` and
        therefore to ``self.answers[i]``.
        """
        index = nmslib.init(method='hnsw', space='cosinesimil')
        index.addDataPointBatch(self.to_vectors(self.questions))
        index.createIndex({'post': 2}, print_progress=True)
        # Publish the index only once it is fully built.
        self.index = index

    def search(self, text, max_distance=0.2):
        """Find the stored question closest to ``text``.

        Runs a k-nearest-neighbour query against the question index and keeps
        the single closest neighbour if it is strictly nearer than
        ``max_distance``.

        Args:
            text: (str) sample question text
            max_distance: (float) maximum allowed distance for neighbours

        Returns:
            dict: ``{'index': int, 'distance': float}`` for the best match, or
            an empty dict when no neighbour is close enough (or none exists).
        """
        vector = self.nlp(text).vector

        # A missing, empty or all-zero vector has no direction, so a cosine
        # distance to it is undefined; report "no match" instead of querying.
        if vector is None or not vector.any():
            return {}

        # Build lazily so that query() works even if the caller never called
        # build_nmslib_index() explicitly.
        if getattr(self, 'index', None) is None:
            self.build_nmslib_index()

        ids, distances = self.index.knnQuery(vector)

        # Reducing over an empty array raises, so treat "no neighbours
        # returned" as "no match".
        if ids is None or distances is None or len(distances) == 0:
            return {}

        best = distances.argmin()
        # Written as `not <` so that a NaN distance is rejected as well.
        if not distances[best] < max_distance:
            return {}

        return {'index': int(ids[best]), 'distance': float(distances[best])}

    def query(self, question, max_distance=0.2):
        """Answer ``question`` using the closest stored question.

        Args:
            question: (str) question text to look up
            max_distance: (float) maximum allowed distance, passed to search()

        Returns:
            The stored answer paired with the best-matching question, or the
            string "N/A" when search() finds no match.
        """
        match = self.search(question, max_distance)
        if not match:
            return "N/A"

        return self.answers[match['index']]

In [71]:
# Create the QA model from the question/answer frame (this loads the spaCy 'en' model).
qa = QA(data)
# Embed every stored question and build the nmslib index that qa.query searches.
qa.build_nmslib_index()

In [85]:
# Sanity check: a stored question reworded with an extra word ("day"), queried with
# a stricter distance threshold (0.05) than the 0.2 default.
qa.query('Carl and the Passions day changed band name to what', max_distance=0.05)

'Beach Boys'

In [58]:
# Show the first two question/answer rows again for reference.
data.head(2)

Unnamed: 0,question,answer
0,Carl and the Passions changed band name to what,Beach Boys
1,How many rings on the Olympic flag,Five


In [59]:
# Answer every stored question with the model (default max_distance) to get one prediction per row.
preds = data['question'].apply(qa.query)