In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = np.loadtxt('TrainData.txt', delimiter='\t', dtype=str)

In [3]:
df = pd.DataFrame(data)
df.columns = df.iloc[0]

In [4]:
df = df[1:]
df.head()

Unnamed: 0,Answer,Question
1,Yes. It's absolutely beautiful today.,The weather is great isn't it?
2,"yes, i like that one, too.",that one. the one that's all black.
3,it's really nice.,i got it from macy's.
4,at 8:00 p.m.,when does it start?
5,"nothing, except my favorite color is blue.",what's the matter with green eyes?


In [5]:
test_data = pd.read_csv('TestData.csv')

In [6]:
test_data.head()

Unnamed: 0,I.D.,Question
0,QN_1,"i'll give you a speech like that, too."
1,QN_2,"i know, you're absolutely right."
2,QN_3,i liked it.
3,QN_4,the baby was eight pounds six ounces.
4,QN_5,I was sold a wireless service unavailable in m...


# Preprocessing

In [7]:
import re
import nltk
import gensim
from gensim.parsing.preprocessing import remove_stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

from nltk.stem.lancaster import LancasterStemmer

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [8]:
#st = LancasterStemmer()
def clean_sentence(sentence,stopwords = False,remove_digits = False):
    sentence = sentence.lower().strip()
    pattern = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]'
    sentence = re.sub(pattern,"",sentence)
    
    if stopwords:
        senetnce = remove_stopwords(sentence)
        lemmas = WordNetLemmatizer()
        meaningful_words = [lemmas.lemmatize(w) for w in sentence.split()]
        sentence = " ".join(meaningful_words)
    else:
        lemmas = WordNetLemmatizer()
        meaningful_words = [lemmas.lemmatize(w) for w in sentence.split()]
        sentence = " ".join(meaningful_words)
    return sentence

In [9]:
def get_cleaned_sentences(df,stopwords = False):
    cleaned_sentences = []
    for i,j in df.iterrows():
        cleaned = clean_sentence(j['Question'],stopwords);
        cleaned_sentences.append(cleaned);
    return cleaned_sentences;

In [10]:
cleaned_sentences = get_cleaned_sentences(df,stopwords = True)
print(cleaned_sentences)

['the weather is great isnt it', 'that one the one thats all black', 'i got it from macys', 'when doe it start', 'whats the matter with green eye', 'have you ever read a book', 'what is bioinformatics', 'can i try it on', 'which account are you making this withdrawal from', 'thats the truth', 'what is basketball', 'we can watch my dvd', 'i saw dad wipe his nose on his sleeve yesterday', 'i can cancel it for you right now', 'im sure everything will be okay in a day or two', 'ive actually been busy lately', 'i wish i wa free that night im kind of mad that i didnt go', 'how are you doing today', 'im going to the movie with a friend how about you', 'what are they', 'do you drink', 'the price of stamp go up and up', 'the forecast say that it will be warm on the weekend', 'how long is this bus ride', 'how wa the movie', 'i mixed them together', 'oh really maybe you should have called 911', 'well get warmer a we walk', 'i never used to buy swiss cheese', 'maybe im not sure', 'i lost my new pe

In [11]:
cleaned_sentences_with_stopwords = get_cleaned_sentences(df,stopwords = False)
print(cleaned_sentences_with_stopwords)

['the weather is great isnt it', 'that one the one thats all black', 'i got it from macys', 'when doe it start', 'whats the matter with green eye', 'have you ever read a book', 'what is bioinformatics', 'can i try it on', 'which account are you making this withdrawal from', 'thats the truth', 'what is basketball', 'we can watch my dvd', 'i saw dad wipe his nose on his sleeve yesterday', 'i can cancel it for you right now', 'im sure everything will be okay in a day or two', 'ive actually been busy lately', 'i wish i wa free that night im kind of mad that i didnt go', 'how are you doing today', 'im going to the movie with a friend how about you', 'what are they', 'do you drink', 'the price of stamp go up and up', 'the forecast say that it will be warm on the weekend', 'how long is this bus ride', 'how wa the movie', 'i mixed them together', 'oh really maybe you should have called 911', 'well get warmer a we walk', 'i never used to buy swiss cheese', 'maybe im not sure', 'i lost my new pe

In [12]:
sentences = cleaned_sentences_with_stopwords

words = [[word for word in documents.split()] for documents in sentences]

from gensim import corpora

dictionary = corpora.Dictionary(words)

for key, value in dictionary.items():
  print(key,' : ',value)

import pprint
bow_corpus = [dictionary.doc2bow(text) for text in words]
for sent,embedding in zip(sentences,bow_corpus):
  print(sent)
  print(embedding)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
ive been pretty busy myself
[(68, 1), (69, 1), (70, 1), (160, 1), (238, 1)]
can they take their belonging
[(31, 1), (89, 1), (147, 1), (399, 1), (2149, 1)]
why doesnt it close
[(3, 1), (148, 1), (234, 1), (494, 1)]
i dont know
[(13, 1), (173, 1), (174, 1)]
did you see that car
[(9, 1), (28, 1), (134, 1), (222, 1), (329, 1)]
i toss and turn all night
[(6, 1), (13, 1), (77, 1), (92, 1), (786, 1), (2150, 1)]
cat have beautiful eye
[(18, 1), (26, 1), (809, 1), (1812, 1)]
no he said he gently placed the boy on the street
[(4, 2), (32, 1), (128, 1), (149, 2), (472, 1), (604, 1), (900, 1), (1262, 1), (2151, 1)]
dont be ridiculous
[(57, 1), (173, 1), (1817, 1)]
sound like it wa a close game
[(3, 1), (23, 1), (79, 1), (168, 1), (205, 1), (234, 1), (560, 1)]
yes thats why it also the best hamburger in town
[(3, 1), (4, 1), (10, 1), (61, 1), (148, 1), (272, 1), (390, 1), (464, 1), (835, 1), (1965, 1)]
didnt you laugh through the who

In [13]:
ques = "What is your favourite anime?"
question = clean_sentence(ques,stopwords = False)
question_embed = dictionary.doc2bow(question.split())

print("\n\n",question,"\n",question_embed)



 what is your favourite anime 
 [(1, 1), (30, 1), (166, 1), (992, 1)]


In [14]:
import sklearn
from sklearn.metrics.pairwise import cosine_similarity

In [15]:
answer = []
retreieved = []
def RetreiveAndPrintAnswer(question_embedding,sentence_embeddings,FAQdf,sentences):
  max_sim = -1
  index_sim = -1
  for index,faq_embedding in enumerate(sentence_embeddings):
    sim = cosine_similarity(faq_embedding,question_embedding)[0][0]
    if sim>max_sim:
      max_sim = sim
      index_sim = index

  answer.append(FAQdf.iloc[index_sim,0])
  retreieved.append(FAQdf.iloc[index_sim,1])


In [16]:
test_questions = test_data['Question']

In [None]:
count = 0
for i in test_questions:
  question = clean_sentence(i,stopwords = False)
  question_embed = dictionary.doc2bow(question.split())
  RetreiveAndPrintAnswer(question_embed,bow_corpus,df,sentences)
  count = count+1

In [None]:
len(test_data),len(answer)

(543, 543)

In [None]:
first_submission = test_data.copy()

In [None]:
first_submission['Answer'] = answer

In [None]:
first_submission = first_submission.drop(['Question'],axis = 1)

In [None]:
first_submission

Unnamed: 0,I.D.,Answer
0,QN_1,Yes.Take advantage of it.
1,QN_2,i will talk to your father about that.
2,QN_3,it's really nice.
3,QN_4,"yes, i like that one, too."
4,QN_5,i will talk to your father about that.
...,...,...
538,QN_539,"yes, i like that one, too."
539,QN_540,I once knew a program who could dream. I don...
540,QN_541,are you going to buy a desktop or laptop?
541,QN_542,I am fine


In [None]:
first_submission.set_index(['I.D.'],inplace=True)

In [None]:
first_submission.to_csv('Abhishek_K_1.csv') 

In [None]:
pd.DataFrame({"Question":test_data['Question'],"Question Retreived from Corpus":retreieved,"Answer":answer}).head()

Unnamed: 0,Question,Question Retreived from Corpus,Answer
0,"i'll give you a speech like that, too.",Yes.Take advantage of it.,Yes.Take advantage of it.
1,"i know, you're absolutely right.",i will talk to your father about that.,i will talk to your father about that.
2,i liked it.,it's really nice.,it's really nice.
3,the baby was eight pounds six ounces.,"yes, i like that one, too.","yes, i like that one, too."
4,I was sold a wireless service unavailable in m...,i will talk to your father about that.,i will talk to your father about that.


Not a good approach

# Glove Embedding

In [None]:
from gensim.models import word2vec
import gensim.downloader as api

In [None]:
glove_model = None
try:
  glove_model = gensim.models.keyedVectors.load("./glovemodel.mod")
  print("Loaded glove model")
except:
  glove_model = api.load('glove-twitter-25')
  glove_model.save("./glovemodel.mod")
  print("Saved glove model")


Saved glove model


In [None]:
v2w_model = None
try:
  v2w_model = gensim.model.KeyedVectors.load("./w2vecmodel.mod")
  print("Loaded word2vec model")
except:
  v2w_model = api.load('word2vec-google-news-300')
  v2w_model.save("./w2vecmodel.mod")
  print("Saved word2vec model")


Saved word2vec model


In [None]:
w2vec_embedding_size = len(v2w_model['computer'])
glove_embedding_size = len(glove_model['computer'])
w2vec_embedding_size,glove_embedding_size

(300, 25)

In [None]:
def getWordvec(word,model):
  samp = model['computer']
  vec = [0]*len(samp)
  try:
    vec = model[word]
  except:
    vec = [0]*len(samp)
  return (vec)

In [None]:
def getPhraseEmbedding(phrase,embeddingmodel):
  samp = getWordvec('computer',embeddingmodel)
  vec = np.array([0]*len(samp))
  den = 0
  for word in phrase.split():
    den = den+1
    vec = vec+np.array(getWordvec(word,embeddingmodel))
    return vec.reshape(1,-1)

In [None]:
sent_embeddings = []
for sent in cleaned_sentences:
  sent_embeddings.append(getPhraseEmbedding(sent,v2w_model))

In [None]:
answer = []
retreieved = []

In [None]:
for i in test_questions:
  question = clean_sentence(i,stopwords = True)
  question_embed = getPhraseEmbedding(question,v2w_model)
  RetreiveAndPrintAnswer(question_embed,sent_embeddings,df,cleaned_sentences)

In [None]:
len(answer),len(test_data),len(retreieved)

(543, 543, 543)

In [None]:
second_submission = test_data.copy()

In [None]:
second_submission['Answer'] = answer

In [None]:
second_submission.head()

Unnamed: 0,I.D.,Question,Answer
0,QN_1,"i'll give you a speech like that, too.",Bye! I'll be back.
1,QN_2,"i know, you're absolutely right.",it's really nice.
2,QN_3,i liked it.,it's really nice.
3,QN_4,the baby was eight pounds six ounces.,Yes. It's absolutely beautiful today.
4,QN_5,I was sold a wireless service unavailable in m...,it's really nice.


In [None]:
second_submission = second_submission.drop(['Question'],axis = 1)

In [None]:
second_submission.set_index(['I.D.'],inplace=True)

In [None]:
second_submission.to_csv('Abhishek_K_4.csv')

In [None]:
pd.DataFrame({"Question":test_data['Question'],"Question Retreived from Corpus":retreieved,"Answer":answer}).tail()

Unnamed: 0,Question,Question Retreived from Corpus,Answer
538,at night i went out to eat. the food was delic...,at 8:00 p.m.,i'll be there.
539,TELL ME ABOUT BASEBALL,Tell me about your dreams,I once knew a program who could dream. I don...
540,"you'll remember to do it, but you won't have t...","you'll remember to do it, but you won't have t...","okay, i'll take it out front right now."
541,you are not smart,you have no idea how happy i am for you.,for real?
542,what do you mean?,what is bioinformatics,a fancy name for applied computer science in ...


Somewhat OK

In [None]:
sent_embeddings = []
for sent in cleaned_sentences_with_stopwords:
  sent_embeddings.append(getPhraseEmbedding(sent,glove_model))

In [None]:
answer = []
retreieved = []

In [None]:
for i in test_questions:
  question = clean_sentence(i,stopwords = True)
  question_embed = getPhraseEmbedding(question,glove_model)
  RetreiveAndPrintAnswer(question_embed,sent_embeddings,df,cleaned_sentences)

In [None]:
len(answer),len(test_data),len(retreieved)

(543, 543, 543)

In [None]:
second_submission1 = test_data.copy()

In [None]:
second_submission1['Answer'] = answer

In [None]:
second_submission1.head()

Unnamed: 0,I.D.,Question,Answer
0,QN_1,"i'll give you a speech like that, too.",Bye! I'll be back.
1,QN_2,"i know, you're absolutely right.",it's really nice.
2,QN_3,i liked it.,it's really nice.
3,QN_4,the baby was eight pounds six ounces.,Yes. It's absolutely beautiful today.
4,QN_5,I was sold a wireless service unavailable in m...,it's really nice.


In [None]:
pd.DataFrame({"Question":test_data['Question'],"Question Retreived from Corpus":retreieved,"Answer":answer}).tail()

Unnamed: 0,Question,Question Retreived from Corpus,Answer
538,at night i went out to eat. the food was delic...,at 8:00 p.m.,i'll be there.
539,TELL ME ABOUT BASEBALL,Tell me about your dreams,I once knew a program who could dream. I don...
540,"you'll remember to do it, but you won't have t...","you'll remember to do it, but you won't have t...","okay, i'll take it out front right now."
541,you are not smart,you have no idea how happy i am for you.,for real?
542,what do you mean?,what is bioinformatics,a fancy name for applied computer science in ...


word2vec with stopwords

In [None]:
sent_embeddings = []
for sent in cleaned_sentences_with_stopwords:
  sent_embeddings.append(getPhraseEmbedding(sent,v2w_model))

In [None]:
answer = []
retreieved = []

In [None]:
for i in test_questions:
  question = clean_sentence(i,stopwords = True)
  question_embed = getPhraseEmbedding(question,v2w_model)
  RetreiveAndPrintAnswer(question_embed,sent_embeddings,df,cleaned_sentences)

In [None]:
len(answer),len(test_data),len(retreieved)

(543, 543, 543)

In [None]:
Third_submission = test_data.copy()

In [None]:
Third_submission['Answer'] = answer

In [None]:
Third_submission.head()

Unnamed: 0,I.D.,Question,Answer
0,QN_1,"i'll give you a speech like that, too.",Bye! I'll be back.
1,QN_2,"i know, you're absolutely right.",it's really nice.
2,QN_3,i liked it.,it's really nice.
3,QN_4,the baby was eight pounds six ounces.,Yes. It's absolutely beautiful today.
4,QN_5,I was sold a wireless service unavailable in m...,it's really nice.


In [None]:
Third_submission = Third_submission.drop(['Question'],axis = 1)

In [None]:
Third_submission.set_index(['I.D.'],inplace=True)

In [None]:
Third_submission.to_csv('Abhishek_K_5.csv')

In [None]:
pd.DataFrame({"Question":test_data['Question'],"Question Retreived from Corpus":retreieved,"Answer":answer}).tail()

Unnamed: 0,Question,Question Retreived from Corpus,Answer
538,at night i went out to eat. the food was delic...,at 8:00 p.m.,i'll be there.
539,TELL ME ABOUT BASEBALL,Tell me about your dreams,I once knew a program who could dream. I don...
540,"you'll remember to do it, but you won't have t...","you'll remember to do it, but you won't have t...","okay, i'll take it out front right now."
541,you are not smart,you have no idea how happy i am for you.,for real?
542,what do you mean?,what is bioinformatics,a fancy name for applied computer science in ...


# Pre Trained Models

In [76]:
pip install -U sentence-transformers



In [None]:
from sentence_transformers import SentenceTransformer,util

model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')


In [None]:
answer = []
retreieved = []
def RetreiveAndPrintAnswer(question_embedding,sentence_embeddings,FAQdf,sentences):
  max_sim = -1
  index_sim = -1
  for index,faq_embedding in enumerate(sentence_embeddings):
    sim = cosine_similarity(faq_embedding,question_embedding)
    
    if sim>max_sim:
      max_sim = sim
      index_sim = index

  answer.append(FAQdf.iloc[index_sim,0])
  retreieved.append(FAQdf.iloc[index_sim,1])


In [None]:
sent_embeddings=[]
for sent in cleaned_sentences_with_stopwords:
    sent_embed = model.encode(sent)
    sent_embed = np.expand_dims(sent_embed, axis = 0)
    sent_embeddings.append(sent_embed)
    

In [None]:
for i in test_questions:
    question = clean_sentence(i,stopwords = False)
    question_embed = model.encode(question)
    question_embed = np.expand_dims(question_embed, axis = 0)
    RetreiveAndPrintAnswer(question_embed,sent_embeddings,df,cleaned_sentences)
   

In [None]:
len(answer)

543

In [None]:
fourth_submission = test_data.copy()

In [None]:
fourth_submission['Answer'] = answer

In [None]:
fourth_submission.head()

Unnamed: 0,I.D.,Question,Answer
0,QN_1,"i'll give you a speech like that, too.",do you think anyone will come to my funeral?
1,QN_2,"i know, you're absolutely right.",i wish it would cool off one day.
2,QN_3,i liked it.,"i'll give you a speech like that, too."
3,QN_4,the baby was eight pounds six ounces.,that's good to hear.
4,QN_5,I was sold a wireless service unavailable in m...,"I see it here,we charged you $5 extra a month."


In [None]:
fourth_submission = fourth_submission.drop(['Question'],axis=1)

In [None]:
fourth_submission.set_index(['I.D.'],inplace = True)

In [None]:
fourth_submission.to_csv('Abhishek_K_4.csv')

In [None]:
pd.DataFrame({"Question":test_data['Question'],"Question Retreived from Corpus":retreieved,"Answer":answer})

Unnamed: 0,Question,Question Retreived from Corpus,Answer
0,"i'll give you a speech like that, too.","i'll give you a speech like that, too.",do you think anyone will come to my funeral?
1,"i know, you're absolutely right.","i know, you're absolutely right.",i wish it would cool off one day.
2,i liked it.,i liked it.,"i'll give you a speech like that, too."
3,the baby was eight pounds six ounces.,the baby was 8 pounds 6 ounces.,that's good to hear.
4,I was sold a wireless service unavailable in m...,I was sold a wireless service unavailable in m...,"I see it here,we charged you $5 extra a month."
...,...,...,...
538,at night i went out to eat. the food was delic...,at night i went out to eat. the food was delic...,people who live in hawaii are lucky.
539,TELL ME ABOUT BASEBALL,TELL ME ABOUT BASEBALL,What is Baseball
540,"you'll remember to do it, but you won't have t...","you'll remember to do it, but you won't have t...","okay, i'll take it out front right now."
541,you are not smart,you are not smart,you may be right.


In [None]:
model = SentenceTransformer('sentence-transformers/distilbert-base-nli-stsb-mean-tokens')

Downloading:   0%|          | 0.00/345 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.01k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/555 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/505 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
answer = []
retreieved = []
def RetreiveAndPrintAnswer(question_embedding,sentence_embeddings,FAQdf,sentences):
  max_sim = -1
  index_sim = -1
  for index,faq_embedding in enumerate(sentence_embeddings):
    sim = cosine_similarity(faq_embedding,question_embedding)
    
    if sim>max_sim:
      max_sim = sim
      index_sim = index

  answer.append(FAQdf.iloc[index_sim,0])
  retreieved.append(FAQdf.iloc[index_sim,1])


In [None]:
sent_embeddings=[]
for sent in cleaned_sentences_with_stopwords:
    sent_embed = model.encode(sent)
    sent_embed = np.expand_dims(sent_embed, axis = 0)
    sent_embeddings.append(sent_embed)
    

In [None]:
for i in test_questions:
    question = clean_sentence(i,stopwords = False)
    question_embed = model.encode(question)
    question_embed = np.expand_dims(question_embed, axis = 0)
    RetreiveAndPrintAnswer(question_embed,sent_embeddings,df,cleaned_sentences)
   

In [None]:
len(answer)

543

In [None]:
fifth_submission = test_data.copy()

In [None]:
fifth_submission['Answer'] = answer

In [None]:
fifth_submission.head()

Unnamed: 0,I.D.,Question,Answer
0,QN_1,"i'll give you a speech like that, too.",do you think anyone will come to my funeral?
1,QN_2,"i know, you're absolutely right.",i wish it would cool off one day.
2,QN_3,i liked it.,"i'll give you a speech like that, too."
3,QN_4,the baby was eight pounds six ounces.,that's good to hear.
4,QN_5,I was sold a wireless service unavailable in m...,"I see it here,we charged you $5 extra a month."


In [None]:
fifth_submission = fifth_submission.drop(['Question'],axis=1)

In [None]:
fifth_submission.set_index(['I.D.'],inplace = True)

In [None]:
fifth_submission.to_csv('Abhishek_K_5.csv')

In [None]:
pd.DataFrame({"Question":test_data['Question'],"Question Retreived from Corpus":retreieved,"Answer":answer})

Unnamed: 0,Question,Question Retreived from Corpus,Answer
0,"i'll give you a speech like that, too.","i'll give you a speech like that, too.",do you think anyone will come to my funeral?
1,"i know, you're absolutely right.","i know, you're absolutely right.",i wish it would cool off one day.
2,i liked it.,i liked it.,"i'll give you a speech like that, too."
3,the baby was eight pounds six ounces.,the baby was 8 pounds 6 ounces.,that's good to hear.
4,I was sold a wireless service unavailable in m...,I was sold a wireless service unavailable in m...,"I see it here,we charged you $5 extra a month."
...,...,...,...
538,at night i went out to eat. the food was delic...,at night i went out to eat. the food was delic...,people who live in hawaii are lucky.
539,TELL ME ABOUT BASEBALL,TELL ME ABOUT BASEBALL,What is Baseball
540,"you'll remember to do it, but you won't have t...","you'll remember to do it, but you won't have t...","okay, i'll take it out front right now."
541,you are not smart,you are not smart,you may be right.


# Preprocessing Text again

In [18]:
!pip install contractions

Collecting contractions
  Downloading contractions-0.0.58-py2.py3-none-any.whl (8.0 kB)
Collecting textsearch>=0.0.21
  Downloading textsearch-0.0.21-py2.py3-none-any.whl (7.5 kB)
Collecting pyahocorasick
  Downloading pyahocorasick-1.4.2.tar.gz (321 kB)
[K     |████████████████████████████████| 321 kB 5.2 MB/s 
[?25hCollecting anyascii
  Downloading anyascii-0.3.0-py3-none-any.whl (284 kB)
[K     |████████████████████████████████| 284 kB 32.1 MB/s 
[?25hBuilding wheels for collected packages: pyahocorasick
  Building wheel for pyahocorasick (setup.py) ... [?25l[?25hdone
  Created wheel for pyahocorasick: filename=pyahocorasick-1.4.2-cp37-cp37m-linux_x86_64.whl size=85453 sha256=b5816dc15bdf2a154585b6194ff4226f854fd42a585cf83cd4689740113ab898
  Stored in directory: /root/.cache/pip/wheels/25/19/a6/8f363d9939162782bb8439d886469756271abc01f76fbd790f
Successfully built pyahocorasick
Installing collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully install

In [67]:
import contractions

In [68]:
#st = LancasterStemmer()
def clean_sentence(sentence,stopwords = False,remove_digits = False):
    words = [contractions.fix(word) for word in sentence.split()]
    sentence = " ".join(words)
    sentence = sentence.lower().strip()
    pattern = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]'
    sentence = re.sub(pattern,"",sentence)
    
    

    if stopwords:
        senetnce = remove_stopwords(sentence)
        lemmas = WordNetLemmatizer()
        meaningful_words = [lemmas.lemmatize(w) for w in sentence.split()]
        sentence = " ".join(meaningful_words)
    else:
        lemmas = WordNetLemmatizer()
        meaningful_words = [lemmas.lemmatize(w) for w in sentence.split()]
        sentence = " ".join(meaningful_words)
    return sentence

In [69]:
def get_cleaned_sentences(df,stopwords = False):
    cleaned_sentences = []
    for i,j in df.iterrows():
        cleaned = clean_sentence(j['Question'],stopwords);
        cleaned_sentences.append(cleaned);
    return cleaned_sentences;

In [70]:
cleaned_sentences = get_cleaned_sentences(df,stopwords = True)
print(cleaned_sentences)

['the weather is great is not it', 'that one the one that is all black', 'i got it from macys', 'when doe it start', 'what is the matter with green eye', 'have you ever read a book', 'what is bioinformatics', 'can i try it on', 'which account are you making this withdrawal from', 'that is the truth', 'what is basketball', 'we can watch my dvd', 'i saw dad wipe his nose on his sleeve yesterday', 'i can cancel it for you right now', 'i am sure everything will be okay in a day or two', 'i have actually been busy lately', 'i wish i wa free that night i am kind of mad that i did not go', 'how are you doing today', 'i am going to the movie with a friend how about you', 'what are they', 'do you drink', 'the price of stamp go up and up', 'the forecast say that it will be warm on the weekend', 'how long is this bus ride', 'how wa the movie', 'i mixed them together', 'oh really maybe you should have called 911', 'we will get warmer a we walk', 'i never used to buy swiss cheese', 'maybe i am not 

In [71]:
cleaned_sentences_with_stopwords = get_cleaned_sentences(df,stopwords = False)
print(cleaned_sentences)

['the weather is great is not it', 'that one the one that is all black', 'i got it from macys', 'when doe it start', 'what is the matter with green eye', 'have you ever read a book', 'what is bioinformatics', 'can i try it on', 'which account are you making this withdrawal from', 'that is the truth', 'what is basketball', 'we can watch my dvd', 'i saw dad wipe his nose on his sleeve yesterday', 'i can cancel it for you right now', 'i am sure everything will be okay in a day or two', 'i have actually been busy lately', 'i wish i wa free that night i am kind of mad that i did not go', 'how are you doing today', 'i am going to the movie with a friend how about you', 'what are they', 'do you drink', 'the price of stamp go up and up', 'the forecast say that it will be warm on the weekend', 'how long is this bus ride', 'how wa the movie', 'i mixed them together', 'oh really maybe you should have called 911', 'we will get warmer a we walk', 'i never used to buy swiss cheese', 'maybe i am not 

In [72]:
pd.DataFrame({"Cleaned Sentences":cleaned_sentences,"Cleaned Sentences with Stopwords":cleaned_sentences_with_stopwords})

Unnamed: 0,Cleaned Sentences,Cleaned Sentences with Stopwords
0,the weather is great is not it,the weather is great is not it
1,that one the one that is all black,that one the one that is all black
2,i got it from macys,i got it from macys
3,when doe it start,when doe it start
4,what is the matter with green eye,what is the matter with green eye
...,...,...
5490,at night i went out to eat the food wa delicious,at night i went out to eat the food wa delicious
5491,tell me about baseball,tell me about baseball
5492,you will remember to do it but you will not ha...,you will remember to do it but you will not ha...
5493,you are not smart,you are not smart


In [77]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('sentence-transformers/distilbert-base-nli-stsb-mean-tokens')

Downloading:   0%|          | 0.00/345 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.01k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/555 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/505 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [78]:
answer = []
retreieved = []

In [79]:
sent_embeddings=[]
for sent in cleaned_sentences_with_stopwords:
    sent_embed = model.encode(sent)
    sent_embed = np.expand_dims(sent_embed, axis = 0)
    sent_embeddings.append(sent_embed)
    

In [80]:
for i in test_questions:
    question = clean_sentence(i,stopwords = False)
    question_embed = model.encode(question)
    question_embed = np.expand_dims(question_embed, axis = 0)
    RetreiveAndPrintAnswer(question_embed,sent_embeddings,df,cleaned_sentences_with_stopwords)
   

In [81]:
seventh_submission = test_data.copy()

In [82]:
seventh_submission['Answer'] = answer

In [83]:
seventh_submission.head()

Unnamed: 0,I.D.,Question,Answer
0,QN_1,"i'll give you a speech like that, too.",do you think anyone will come to my funeral?
1,QN_2,"i know, you're absolutely right.",i wish it would cool off one day.
2,QN_3,i liked it.,"i'll give you a speech like that, too."
3,QN_4,the baby was eight pounds six ounces.,that's good to hear.
4,QN_5,I was sold a wireless service unavailable in m...,"I see it here,we charged you $5 extra a month."


In [89]:
pd.options.display.max_columns=None
pd.options.display.max_rows=None

In [90]:
pd.DataFrame({"Question":test_data['Question'],"Question Retreived from Corpus":retreieved,"Answer":answer})

Unnamed: 0,Question,Question Retreived from Corpus,Answer
0,"i'll give you a speech like that, too.","i'll give you a speech like that, too.",do you think anyone will come to my funeral?
1,"i know, you're absolutely right.","i know, you're absolutely right.",i wish it would cool off one day.
2,i liked it.,i liked it.,"i'll give you a speech like that, too."
3,the baby was eight pounds six ounces.,the baby was 8 pounds 6 ounces.,that's good to hear.
4,I was sold a wireless service unavailable in m...,I was sold a wireless service unavailable in m...,"I see it here,we charged you $5 extra a month."
5,maybe four or five pounds?,maybe four or five pounds?,my waist is bigger than it was.
6,do you know hal,do you know hal,hal is the famous artificial intelligence fro...
7,"Yeah,actually a lot of them are.","Yeah,actually a lot of them are.",How does she act?
8,"if it's old age, why don't both of your hands ...","if it's old age, why don't both of your hands ...",that's a good question. maybe it's not old age.
9,it's supposed to start at about eight.,it's supposed to start at about eight.,how many invitations has she given out?


In [85]:
seventh_submission = seventh_submission.drop(['Question'],axis=1)

In [86]:
seventh_submission.set_index(['I.D.'],inplace = True)

In [87]:
seventh_submission.head()

Unnamed: 0_level_0,Answer
I.D.,Unnamed: 1_level_1
QN_1,do you think anyone will come to my funeral?
QN_2,i wish it would cool off one day.
QN_3,"i'll give you a speech like that, too."
QN_4,that's good to hear.
QN_5,"I see it here,we charged you $5 extra a month."


In [88]:
seventh_submission.to_csv('Abhishek_K_7.csv')

In [103]:
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v1')

Downloading:   0%|          | 0.00/737 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.85k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/591 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/15.7k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [104]:
answer = []
retreieved = []

In [105]:
sent_embeddings=[]
for sent in cleaned_sentences_with_stopwords:
    sent_embed = model.encode(sent)
    sent_embed = np.expand_dims(sent_embed, axis = 0)
    sent_embeddings.append(sent_embed)
    

In [106]:
for i in test_questions:
    question = clean_sentence(i,stopwords = False)
    question_embed = model.encode(question)
    question_embed = np.expand_dims(question_embed, axis = 0)
    RetreiveAndPrintAnswer(question_embed,sent_embeddings,df,cleaned_sentences_with_stopwords)
   

In [107]:
len(answer)

543

In [110]:
eighth_submission = test_data.copy()

In [111]:
eighth_submission['Answer'] = answer

In [112]:
eighth_submission.head()

Unnamed: 0,I.D.,Question,Answer
0,QN_1,"i'll give you a speech like that, too.",do you think anyone will come to my funeral?
1,QN_2,"i know, you're absolutely right.",i wish it would cool off one day.
2,QN_3,i liked it.,"i'll give you a speech like that, too."
3,QN_4,the baby was eight pounds six ounces.,that's good to hear.
4,QN_5,I was sold a wireless service unavailable in m...,"I see it here,we charged you $5 extra a month."


In [113]:
eighth_submission = eighth_submission.drop(['Question'],axis = 1)

In [114]:
eighth_submission.set_index(['I.D.'],inplace=True)

In [115]:
eighth_submission.to_csv('Abhishek_K_8.csv')