In [10]:
## question similarity using word2vec(nepali model) and word mover distance

### Notebook Imports

In [1]:
from gensim.models import KeyedVectors



In [2]:
import pandas as pd
import numpy as np

### load model


In [3]:
model = KeyedVectors.load_word2vec_format('nepali_embeddings_word2vec.txt', binary=False)

In [4]:
doc = pd.read_csv('dataset_modified.txt',sep=',')
doc.head()


Unnamed: 0,Sentence,Final Intent
0,तिम्रो नाम के हो,परि
1,तिमिलाई के भनेर चिन्छन अरुले,परि
2,तिमिलाई बोलाउनि नाम के हो,परि
3,म तिमिलाई के भनेर बोलाउ,परि
4,तिमिलाई के भनेर चिन्छन,परि


In [5]:
intent_count=pd.DataFrame(doc['Final Intent'].value_counts())
intent_count['intent']=intent_count.index

In [6]:
intent_count.index = np.arange(len(intent_count))

intent_count.columns = ['Count','Intent']

In [7]:
intent_count = intent_count[['Intent','Count']]
intent_count.head()


Unnamed: 0,Intent,Count
0,हुक्का,18
1,मोमो,17
2,चाउमिन,17
3,पिजा,17
4,चिकेन ललिपप,16


#### saving the dataframe with count for further processing

In [8]:
# intent_count.to_csv('follow_up_qsn_v1.csv',index=None)

In [15]:
# checking word mover distance
model.wmdistance('म राम्रो छु','म ठिक छु')

0.8173072042066036

In [71]:
def most_similar(sentence, corpus,model):
    ''' 
    sentence: question to compare
    corpus: dataset containing all questions
    model: word2vec pretrained  model on nepali corpus
    returns most closest intent to the given sentence and its corresponding similarity measure 
    '''
    similar=[]
    sent_list = []
    for sent in corpus.values:
        sim = model.wmdistance(sentence,sent[0])
        similar.append(sim)
    most_sim_list= sorted(range(len(similar)), key=lambda x: similar[x])[:1]
    max_sim_value = max(similar)
    for i in most_sim_list:
        sent_list.append(corpus.iloc[i,1])
    return sent_list[0],max_sim_value

In [72]:
sent1='अफर छैन'
intent,max_sim=most_similar(sent1,doc,model)

In [73]:
intent

'अफर'

In [65]:
max_sim

1.264619942931464

### reading the dataset with respones

In [60]:
df_with_res = pd.read_csv('follow_up_qsn_v1_1.csv',sep=',')

In [64]:
df_with_res.tail()

Unnamed: 0,Intent,Count,response
44,फेन्टा,8,यहाँ फेन्टा पाइन्छ। हामी सँग जम्बो फेन्टा र सा...
45,स्प्राइट,8,यहाँ स्प्राइट पाइन्छ। हामी सँग जम्बो स्प्राइट ...
46,रेस्टुरेन्ट,7,यो रेस्टुरेन्टको नाम ग्रीन भ्याली हो र हजुर ला...
47,कम्पनि,7,मलाई पाइला टेक्नोलोजीले बनाएको हो | पाइला टेक्...
48,अन्य,0,सरि तपाइको प्रश्न यो रेस्टुरेन्ट सम्बन्धि हुनु...


In [74]:
def respond(df_with_res,intent,max_sim):
    '''df_with_res: dataframe that has response
        max_idx: intent_returned by the similarity function
        max_sim: measure of how similar the returned setence is with the given sentence
    '''
    if max_sim >0.7:
        res=df_with_res.loc[df_with_res['Intent'] == intent]['response']
    else:
        res= df_with_res.loc[df_with_res['Intent'] == 'अन्य']['response']
    return res.values[0]

In [75]:
respond(df_with_res=df_with_res, intent=intent, max_sim=max_sim)

'अफर भनेर त केहि छैन तर थोरै डिस्काउन्ट हुन्छ |'

### database connectivity

In [32]:
import os
import pymysql

In [63]:
host = os.getenv('MYSQL_HOST')
port = os.getenv('MYSQL_PORT')
user = os.getenv('MYSQL_USER')
password = os.getenv('MYSQL_PASSWORD')
database = os.getenv('MYSQL_DATABASE')

conn = pymysql.connect(
    host=host,
    port=int(3306),
    user="root",
    passwd=password,
    db="NLP",
    charset='utf8mb4')

In [64]:
# conn.close()

In [80]:
df = pd.read_sql_query("SELECT response FROM follow_up_qsn_v2 WHERE Intent='{}'".format(max_idx[0]),
    conn)
df.iloc[0,0]

'अफर भनेर त केहि छैन तर थोरै डिस्काउन्ट हुन्छ |'