In [326]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords

In [327]:
# Load the raw question-pair dataframe from disk.
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# when it comes from a trusted source.
unprocessed_df = pd.read_pickle('../data/unprocessed_dataframe')

In [328]:
unprocessed_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,133273,213221,213222.0,How is the life of a math student? Could you d...,Which level of prepration is enough for the ex...,0.0
1,402555,536040,536041.0,How do I control my horny emotions?,How do you control your horniness?,1.0
2,360472,364011,490273.0,What causes stool color to change to yellow?,What can cause stool to come out as little balls?,0.0
3,150662,155721,7256.0,What can one do after MBBS?,What do i do after my MBBS ?,1.0
4,183004,279958,279959.0,Where can I find a power outlet for my laptop ...,"Would a second airport in Sydney, Australia be...",0.0


In [329]:
# Build a word -> vector lookup from the GloVe text file. Each line holds a
# token followed by its whitespace-separated vector components.
embeddings_dict = {}
# The GloVe distribution is UTF-8 text; state the encoding explicitly so the
# platform default (e.g. cp1252 on Windows) is never used to decode it.
with open('../data/glove.6B.50d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings_dict[word] = vector

In [330]:
#example vector embedding
v = embeddings_dict['king']
v.shape

(50,)

In [331]:
#lower casing
def lower_case(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered


#remove numbers
def remove_numbers(text):
    """Return ``text`` with every run of digits deleted."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

# remove punctuation
import string
def remove_punctuation(text):
    """Return ``text`` without any character found in ``string.punctuation``."""
    kept_chars = (symbol for symbol in text if symbol not in string.punctuation)
    return "".join(kept_chars)

#remove stopwords

# Fetch the NLTK stopword corpus (NLTK reports it as up to date when it is
# already present) and load the English stopword list into `stop_words`.
nltk.download('stopwords')
stop_words = stopwords.words('english')

def remove_stopwords(text):
    """Drop every whitespace-separated token of ``text`` found in ``stop_words``.

    The surviving tokens are re-joined with single spaces.
    """
    kept_tokens = []
    for token in text.split():
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet

def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant to pass to ``lemmatize()``.

    The word is tagged on its own with ``nltk.pos_tag`` and only the first
    letter of the resulting tag (upper-cased) is used for the lookup. Tags
    that do not start with J, N, V or R fall back to ``wordnet.NOUN``.
    """
    tagged_word = nltk.pos_tag([word])[0]
    first_letter = tagged_word[1][0].upper()
    wordnet_by_letter = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    if first_letter in wordnet_by_letter:
        return wordnet_by_letter[first_letter]
    return wordnet.NOUN

lemmatizer = WordNetLemmatizer()

def lemmatize(text):
    """Lemmatize each whitespace-separated word of ``text``.

    Every word is lemmatized with the POS returned by ``get_wordnet_pos`` for
    that word, and the lemmas are re-joined with single spaces.
    """
    lemmas = []
    for token in text.split():
        lemmas.append(lemmatizer.lemmatize(token, get_wordnet_pos(token)))
    return " ".join(lemmas)




[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [332]:
class Preprocessor():
    """Apply a selectable set of text-cleaning steps to a pandas column.

    Recognised step names, listed in the fixed order in which they are applied:
    ``'lower_case'``, ``'remove_numbers'``, ``'remove_punctuation'``,
    ``'stopwords'`` and ``'lemmatize'``.
    """

    def __init__(self):
        pass

    def preprocess(self,df_column, steps):
        """Run the requested cleaning steps over ``df_column``.

        Parameters
        ----------
        df_column : object with an ``apply`` method (e.g. a pandas Series)
            Column of strings to clean.
        steps : collection of str
            Names of the steps to run. The order of ``steps`` is irrelevant;
            steps always run in the order listed on the class.

        Returns
        -------
        The transformed column. It is also stored on ``self.df_column``, and
        ``steps`` on ``self.steps``.

        Warns
        -----
        UserWarning
            If ``steps`` contains a name that is not a recognised step. Such
            names are ignored, so a typo would otherwise silently disable a
            cleaning stage.
        """
        import warnings

        # (step name, cleaning function) pairs; the tuple order is the order
        # of application.
        pipeline = (
            ('lower_case', lower_case),
            ('remove_numbers', remove_numbers),
            ('remove_punctuation', remove_punctuation),
            ('stopwords', remove_stopwords),
            ('lemmatize', lemmatize),
        )

        # A bare string is matched by substring below, so per-item validation
        # is only meaningful for real collections of names.
        if not isinstance(steps, str):
            known_names = [name for name, _ in pipeline]
            unknown = [step for step in steps if step not in known_names]
            if unknown:
                warnings.warn(
                    "Ignoring unknown preprocessing step(s): {}. Valid steps are: {}".format(
                        unknown, known_names
                    )
                )

        self.df_column = df_column
        self.steps = steps

        for step_name, step_function in pipeline:
            if step_name in self.steps:
                self.df_column = self.df_column.apply(step_function)

        return self.df_column
    

In [342]:
# Work on a copy with a fresh 0..n-1 index so the raw frame stays untouched.
df = unprocessed_df.copy().reset_index(drop=True)
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,133273,213221,213222.0,How is the life of a math student? Could you d...,Which level of prepration is enough for the ex...,0.0
1,402555,536040,536041.0,How do I control my horny emotions?,How do you control your horniness?,1.0
2,360472,364011,490273.0,What causes stool color to change to yellow?,What can cause stool to come out as little balls?,0.0
3,150662,155721,7256.0,What can one do after MBBS?,What do i do after my MBBS ?,1.0
4,183004,279958,279959.0,Where can I find a power outlet for my laptop ...,"Would a second airport in Sydney, Australia be...",0.0


In [343]:
# Cleaning steps applied to both question columns.
steps = ['lower_case','remove_numbers','remove_punctuation','lemmatize']

processor = Preprocessor()
for question_column in ('question1', 'question2'):
    df[question_column] = processor.preprocess(df[question_column], steps)

In [344]:
df.head(2)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,133273,213221,213222.0,how be the life of a math student could you de...,which level of prepration be enough for the ex...,0.0
1,402555,536040,536041.0,how do i control my horny emotion,how do you control your horniness,1.0


In [345]:
# Build the query set: the first 100 question pairs labelled as duplicates
# (is_duplicate == 1), re-indexed from 0.
queries = (
    df.loc[df['is_duplicate'] == 1, ['qid1','question1','qid2','question2', 'is_duplicate']]
    .iloc[:100]
    .reset_index(drop=True)
)
queries.head(2)

Unnamed: 0,qid1,question1,qid2,question2,is_duplicate
0,536040,how do i control my horny emotion,536041.0,how do you control your horniness,1.0
1,155721,what can one do after mbbs,7256.0,what do i do after my mbbs,1.0


In [346]:
class GloveVectorizer():
    """Turn sentences into fixed-size vectors by averaging word embeddings."""

    def __init__(self):
        pass

    def transform(self,embeddings,data):
        """Embed each sentence as the mean of its known word vectors.

        Parameters
        ----------
        embeddings : dict-like
            Maps a word to a 1-D vector; all vectors are expected to share one
            length.
        data : sized iterable of str (list, pandas Series, ...)
            Sentences to embed. Words are obtained with ``str.split()``.

        Returns
        -------
        numpy.ndarray of shape ``(len(data), D)``
            Row ``i`` is the mean vector of the words of the ``i``-th sentence
            that are present in ``embeddings``. Sentences with no known word
            keep an all-zero row.

        Raises
        ------
        ValueError
            If ``embeddings`` is empty, since the dimensionality cannot be
            determined.
        """
        self.embeddings = embeddings
        self.data = data

        if len(embeddings) == 0:
            raise ValueError("embeddings must contain at least one word vector")
        # Read the dimensionality from any stored vector instead of assuming a
        # particular word (such as 'king') is part of the vocabulary.
        first_vector = next(iter(embeddings.values()))
        self.D = np.asarray(first_vector).shape[0]

        X = np.zeros(shape = (len(data),self.D))

        # Iterate positionally: data[i] is a label lookup on a pandas Series
        # and fails (or picks the wrong row) when the index is not 0..n-1.
        for row, sentence in enumerate(data):
            vectors = [embeddings[word] for word in sentence.split() if word in embeddings]
            if vectors:
                X[row,:] = np.mean(np.array(vectors), axis=0)

        return X
                    

In [347]:
# Candidate responses: one row per distinct qid2, re-indexed from 0.
corpus = (
    df.loc[:, ['qid2','question2']]
    .drop_duplicates(subset='qid2')
    .reset_index(drop=True)
)
len(corpus)

273121

In [348]:
# Embed the candidate responses (corpus) and the queries with the same
# embedding table; each call returns a matrix with one row per sentence.
gv = GloveVectorizer()
X_train = gv.transform(embeddings_dict,corpus['question2'])
X_test = gv.transform(embeddings_dict,queries['question1'])

In [349]:
#calculate cosine similarity between each query and all responses. 
#Rank responses in terms of cosine similarity

def cos_sim(vector1,vector2):
    """Return the cosine similarity of two 1-D vectors.

    The result is ``dot(vector1, vector2) / (||vector1|| * ||vector2||)`` using
    the L2 norm. When either vector has zero norm the similarity is undefined
    and ``nan`` is returned.
    """
    norm_product = np.linalg.norm(vector1,ord=2)*np.linalg.norm(vector2,ord=2)
    if norm_product == 0:
        # Returning nan directly gives the same value as evaluating 0/0, but
        # without emitting a RuntimeWarning for every zero vector.
        return np.float64(np.nan)
    return np.dot(vector1,vector2)/norm_product

def retreive_unprocessed_question(question_qid, question_type):
    """Look up the original (uncleaned) text of a question in ``unprocessed_df``.

    When ``question_type == 'query'`` the id is matched against ``qid1`` and
    the ``question1`` text is returned; for any other value it is matched
    against ``qid2`` and ``question2`` is returned. The first matching row is
    used.
    """
    if question_type == 'query':
        id_column, text_column = 'qid1', 'question1'
    else:
        id_column, text_column = 'qid2', 'question2'

    matching_rows = unprocessed_df[id_column] == question_qid
    return unprocessed_df.loc[matching_rows, text_column].iloc[0]


def get_matched_responses(query_index, number_of_responses, print_results = False):
    """Rank the corpus by cosine similarity to one query and return the best matches.

    Parameters
    ----------
    query_index : int
        Row of ``queries`` / ``X_test`` identifying the query.
    number_of_responses : int
        How many top-ranked responses to keep.
    print_results : bool, default False
        When True, print the original query, its ground-truth response and the
        matched responses instead of returning them.

    Returns
    -------
    list or None
        The ``qid2`` values of the best-matching corpus rows, most similar
        first, when ``print_results`` is False; ``None`` otherwise.
    """
    query_qid = queries.loc[query_index,'qid1']
    actual_response_id = queries.loc[query_index,'qid2']

    query_vector = X_test[query_index,:]

    # A single matrix-vector product scores the whole corpus at once instead
    # of calling a Python function for every corpus row.
    dot_products = X_train @ query_vector
    norm_products = np.linalg.norm(X_train, axis=1) * np.linalg.norm(query_vector)

    # All-zero embeddings (sentences without any known word) give 0/0 = nan;
    # this is expected, so the floating-point warning is suppressed.
    with np.errstate(divide='ignore', invalid='ignore'):
        similarities = dot_products / norm_products
    similarities = np.nan_to_num(similarities,nan=-9999) #replace nans with large negative number
    sorted_sim = similarities.argsort()[::-1][:number_of_responses]

    matched_responses = [corpus.loc[corpus_id,'qid2'] for corpus_id in sorted_sim]

    if print_results:
        print('Query')
        print(retreive_unprocessed_question(query_qid,'query'))
        print()
        print('Ground truth (actual) response')
        print(retreive_unprocessed_question(actual_response_id,'response'))
        print()
        print('Top {} matching responses'.format(number_of_responses))
        for response in matched_responses:
            print(retreive_unprocessed_question(response,'response'))
    else:
        return matched_responses



In [370]:
get_matched_responses(5,5,True)

  cosine_similarity = np.dot(vector1,vector2)/(np.linalg.norm(vector1,ord=2)*np.linalg.norm(vector2,ord=2))


Query
What are the coolest Android hacks and tricks you know?

Ground truth (actual) response
What are some cool hacks for Android phones?

Top 5 matching responses
What's the Android game you want to be hacked most?
How do I hack android games? I have rooted device!
How do I hack online games when my device is rooted?
How do I make a pc game?
How can I make a gaming app?


In [362]:
# 1.0 when the ground-truth response (qid2) is among the 5 best matches for
# the query, 0.0 otherwise; one entry per query.
top5 = [
    1.0 if queries.loc[query_index,'qid2'] in get_matched_responses(query_index,5,False) else 0.0
    for query_index in range(len(queries))
]
        
        

  cosine_similarity = np.dot(vector1,vector2)/(np.linalg.norm(vector1,ord=2)*np.linalg.norm(vector2,ord=2))


In [363]:
len(top5)

100

In [369]:
from sklearn.metrics import accuracy_score
# Every row of `queries` was selected with is_duplicate == 1, so this accuracy
# is simply the fraction of queries whose true response appeared in the top 5.
accuracy_score(queries['is_duplicate'],top5)

0.36