In [99]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict

### Data wrangling

In [59]:
df = pd.read_csv('../data/data.tsv', sep='\t',error_bad_lines=False)

b'Skipping line 83032: expected 6 fields, saw 7\n'
b'Skipping line 154657: expected 6 fields, saw 7\n'
b'Skipping line 323916: expected 6 fields, saw 7\n'
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [60]:
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,133273,213221,213222.0,How is the life of a math student? Could you d...,Which level of prepration is enough for the ex...,0.0
1,402555,536040,536041.0,How do I control my horny emotions?,How do you control your horniness?,1.0
2,360472,364011,490273.0,What causes stool color to change to yellow?,What can cause stool to come out as little balls?,0.0
3,150662,155721,7256.0,What can one do after MBBS?,What do i do after my MBBS ?,1.0
4,183004,279958,279959.0,Where can I find a power outlet for my laptop ...,"Would a second airport in Sydney, Australia be...",0.0


In [61]:
#remove rows with missing entries in queries, responses or labels
df = df.dropna(axis=0, subset=('question1','question2','is_duplicate' ))

### Text preprocessing

In [62]:
#lower casing
def lower_case(text):
    return text.lower()


#remove numbers
def remove_numbers(text):
    output = re.sub(r'\d+', '', text)
    return output

# remove punctuation
import string
def remove_punctuation(text):
    text_p = "".join([char for char in text if char not in string.punctuation])
    return text_p

#tokenize text

import nltk
nltk.download('punkt')
from nltk import word_tokenize

def tokenize(text):
    words = word_tokenize(text)
    return words

#remove stopwords
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = stopwords.words('english')

def remove_stopwords(text):
    filtered_words = [word for word in text if word not in stop_words]
    return filtered_words




[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [63]:
class Preprocessor():
    def __init__(self):
        pass
    
    def preprocess(self,df_column, steps):
        self.df_column = df_column
        self.steps = steps
    
        if 'lower_case' in self.steps:
            self.df_column = self.df_column.apply(lambda x: lower_case(x))
        
        if 'remove_numbers' in self.steps:
            self.df_column = self.df_column.apply(lambda x: remove_numbers(x))
        
        if 'remove_punctuation' in self.steps:
            self.df_column = self.df_column.apply(lambda x: remove_punctuation(x))           
        
        if 'tokenize' in self.steps:
            self.df_column = self.df_column.apply(lambda x: tokenize(x))
        
        if 'stopwords' in self.steps:
            self.df_column = self.df_column.apply(lambda x: remove_stopwords(x))
            
        
        
        return self.df_column

In [64]:
unprocessed_df = df.copy()

In [65]:
steps = ['lower_case','remove_numbers','remove_punctuation',
        'tokenize','stopwords']
processor = Preprocessor()
df['question1'] = processor.preprocess(df['question1'] ,steps)
df['question2'] = processor.preprocess(df['question2'] ,steps)

In [66]:
df.reset_index(inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 363177 entries, 0 to 363176
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   index         363177 non-null  int64  
 1   id            363177 non-null  object 
 2   qid1          363177 non-null  object 
 3   qid2          363177 non-null  float64
 4   question1     363177 non-null  object 
 5   question2     363177 non-null  object 
 6   is_duplicate  363177 non-null  float64
dtypes: float64(2), int64(1), object(4)
memory usage: 19.4+ MB


### Sentence Matching using pretrained vectors for Word representation - Glove

In [76]:
#download Glove pretrained word vectors and store embeddings in a dictionary
embeddings_dict = {}
with open('../data/glove.6B.300d.txt', 'r') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings_dict[word] = vector

In [77]:
#example vector embedding
v = embeddings_dict['king']
v.shape

(300,)

In [78]:
#create dataframe of top 100 queries that have label = 1
queries = df[df['is_duplicate'] == 1][:100]
queries = queries[['qid1','question1','qid2','question2', 'is_duplicate']]
queries = queries.reset_index(drop=True)
queries.head(2)

Unnamed: 0,qid1,question1,qid2,question2,is_duplicate
0,536040,"[control, horny, emotions]",536041.0,"[control, horniness]",1.0
1,155721,"[one, mbbs]",7256.0,[mbbs],1.0


In [79]:
responses = df[['qid2','question2']]
responses = responses.drop_duplicates('qid2').reset_index(drop=True)
len(responses)

273121

#### Sentence embedding - Averaging word vectors

In [80]:
class Mean_Vectorizer():
    def __init__(self):
        pass
    
    def fit(self,X):
        return self
        
    def transform(self,embeddings,data):
        self.embeddings = embeddings
        self.data = data
        
        #get dimensionality of word embeddings
        v = self.embeddings['king']
        self.D = v.shape[0]
        
        #zeros matrix for 'question2' embeddings
        X = np.zeros(shape = (len(data),self.D))
        
        #averaging word representations
        for i in range(len(data)):
            words = data[i]
            vectors = []
            for word in words:
                try:
                    v = self.embeddings[word]
                    vectors.append(v)
                except KeyError:
                    pass
            vectors = np.array(vectors)
            if len(vectors) >0:
                sentence_embedding = np.mean(vectors, axis=0).reshape(1,-1)
                X[i,:] = sentence_embedding
            
        return X
       

In [81]:
#define function to cosine similarity between each query and all responses. 
#Rank responses in terms of cosine similarity

def cos_sim(vector1,vector2):
    cosine_similarity = np.dot(vector1,vector2)/(np.linalg.norm(vector1,ord=2)*np.linalg.norm(vector2,ord=2))
    return cosine_similarity

#define function to get unprocessed (original) question
def retreive_unprocessed_question(question_qid, question_type):
    
    """
    Parameters:
        question_qid - 'qid' of question
        question_type - 'response'  or 'query'
    
    Returns:
        unprocessed_question - original question before text preprocessing
    """
    if question_type == 'query':
        unprocessed_question = unprocessed_df.loc[unprocessed_df['qid1']==question_qid, 'question1'].iloc[0]
    else:
        unprocessed_question = unprocessed_df.loc[unprocessed_df['qid2']==question_qid, 'question2'].iloc[0]
    
    return unprocessed_question


#define function to get/view top 2/5 most matching responses, given a query index
def get_matched_responses(query_index, responses_matrix,queries_matrix,number_of_responses, print_results = False):
    
    """
    Parameters:
        query_index :  0 to 99 - first 100 queries that have label 'is_duplicate' = 1.
    
        responses_matrix : Matrix of sentence embeddings for unique responses, shape = 273121 x 50
    
        queries_matrix : Matrix of sentence embeddings for queries, shape = 100 x 50
    
        number_of_responses : top 2 or top 5 numbers of matching responses against a query
    
        print_results: If true, prints the unprocessed query, ground truth response and top2 or top5 matching responses obtained by sentence embedding method
    
    Returns: 
        matched_responses : list of 'qid2' for top 2 or top 5 matching responses
    
    
    """
    query_qid = queries.loc[query_index,'qid1']
    actual_response_id = queries.loc[query_index,'qid2']
    
    similarities = []
    for r in range(len(responses_matrix)):
        cosine_sim = cos_sim(queries_matrix[query_index,:],responses_matrix[r,:])
        similarities.append(cosine_sim)
    similarities = np.array(similarities)
    similarities = np.nan_to_num(similarities,nan=-9999) #replace nans with large negative number
    sorted_sim = similarities.argsort()[::-1][:number_of_responses]
    
    matched_responses = []
    for response_id in sorted_sim:
        response_qid2 = responses.loc[response_id,'qid2']
        matched_responses.append(response_qid2)
            
        
    if print_results:
        print('Query')
        print(retreive_unprocessed_question(query_qid,'query'))
        print()
        print('Ground truth (actual) response')
        print(retreive_unprocessed_question(actual_response_id,'response'))
        print()
        print('Top {} matching responses'.format(number_of_responses))
        for response in matched_responses:
            print(retreive_unprocessed_question(response,'response'))
    else:        
        
        return matched_responses 




In [85]:
mv = Mean_Vectorizer()
responses_mean = mv.transform(embeddings_dict,responses['question2'])
queries_mean = mv.transform(embeddings_dict,queries['question1'])

In [86]:
get_matched_responses(1,responses_mean,queries_mean,5,True)

  cosine_similarity = np.dot(vector1,vector2)/(np.linalg.norm(vector1,ord=2)*np.linalg.norm(vector2,ord=2))


Query
What can one do after MBBS?

Ground truth (actual) response
What do i do after my MBBS ?

Top 5 matching responses
What do i do after my MBBS ?
What can I do after my MBBs?
What have to Do after MBBS?
Is M.B.B.S a good major?
Which course should I choose after an MBBS?


In [87]:
#calculate accuracy for matched responses for all queries - sentence embedding by word vector averaging
top2_mean = []
top5_mean = []

for query_index in range(len(queries)):
    actual_response_id = queries.loc[query_index,'qid2']
    top5_ranked = get_matched_responses(query_index,responses_mean,queries_mean,5,False)     
    if actual_response_id in top5_ranked[:2]:
        top2_mean.append(1.0)
        top5_mean.append(1.0)
    elif actual_response_id in top5_ranked:
        top2_mean.append(0.0)
        top5_mean.append(1.0)    
    else:
        top2_mean.append(0.0)
        top5_mean.append(0.0)


  cosine_similarity = np.dot(vector1,vector2)/(np.linalg.norm(vector1,ord=2)*np.linalg.norm(vector2,ord=2))


In [91]:
from sklearn.metrics import accuracy_score
top2_mean_acc = accuracy_score(queries['is_duplicate'], top2_mean)
top5_mean_acc = (accuracy_score(queries['is_duplicate'],top5_mean))

print('Accuracy for matched responses in top 2, with averaged word vectors for sentence embedding: {} %'. format(top2_mean_acc *100))
print('Accuracy for matched responses in top 5, with averaged word vectors for sentence embedding: {} %'. format(top5_mean_acc *100))


Accuracy for matched responses in top 2, with averaged word vectors for sentence embedding: 41.0 %
Accuracy for matched responses in top 5, with averaged word vectors for sentence embedding: 54.0 %


### Sentence embedding -  word vectors weighted by TF-IDF

In [111]:
#calculate IDF for words in responses - sklearn implementation
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = list(responses['question2'])


tfsk = TfidfVectorizer(tokenizer=None,analyzer=(lambda x: x), preprocessor=None, lowercase=False)
X_trainsk = tfsk.fit(corpus)

In [112]:
max_idf = max(tfsk.idf_)
max_idf

12.82452668023003

In [114]:
wordIDF = defaultdict(lambda x: max_idf)
wordIDF = {word : tfsk.idf_[i] for word,i in tfsk.vocabulary_.items()}

In [117]:
len(wordIDF)

66626

In [118]:
#zeros matrix for 'question2' embeddings
X = np.zeros(shape = (len(responses['question2']),300))        
#averaging word representations
for i in range(len(responses)):
    words = responses['question2'][i]
    vectors = []
    for word in words:
        try:
            v = embeddings_dict[word] * wordIDF[word]
            vectors.append(v)
        except KeyError:
            pass
    vectors = np.array(vectors)
    if len(vectors) >0:
        sentence_embedding = np.mean(vectors, axis=0).reshape(1,-1)
        X[i,:] = sentence_embedding

In [102]:
class TFIDF_weight_vectorizer():
    def __init__(self):
        pass
    
    def fit(self,X_train):

        tf = TfidfVectorizer(tokenizer=None,analyzer=(lambda x: x), preprocessor=None, lowercase=False)
        tf.fit(X_train)
        max_idf = max(tf.idf_)
        
        self.word_idf = defaultdict(lambda: max_idf)
        self.word_idf = {word : tf.idf_[i] for word,i in tf.vocabulary_.items()}
        
        return self
        
        
    def transform(self,embeddings,data):
        self.embeddings = embeddings
        self.data = data
        
        #get dimensionality of word embeddings
        v = self.embeddings['king']
        self.D = v.shape[0]
        
        #zeros matrix for 'question2' embeddings
        X = np.zeros(shape = (len(data),self.D))
        
        #averaging word representations
        for i in range(len(data)):
            words = data[i]
            vectors = []
            for word in words:
                try:
                    v = self.embeddings[word] * self.word_idf[word]
                    vectors.append(v)
                except KeyError:
                    pass
            vectors = np.array(vectors)
            if len(vectors) >0:
                sentence_embedding = np.mean(vectors, axis=0).reshape(1,-1)
                X[i,:] = sentence_embedding
            
        return X

In [131]:
tf = TFIDF_weight_vectorizer()
responses_tfidf = tf.fit(list(responses['question2']))
responses_tfidf = tf.transform(embeddings_dict,responses['question2'])
queries_tfidf = tf.transform(embeddings_dict,queries['question1'])

In [132]:
get_matched_responses(1,responses_tfidf,queries_tfidf,5,True)

  cosine_similarity = np.dot(vector1,vector2)/(np.linalg.norm(vector1,ord=2)*np.linalg.norm(vector2,ord=2))


Query
What can one do after MBBS?

Ground truth (actual) response
What do i do after my MBBS ?

Top 5 matching responses
What do i do after my MBBS ?
What have to Do after MBBS?
What can I do after my MBBs?
How should I study in MBBS?
Is M.B.B.S a good major?


In [135]:
#calculate accuracy for matched responses for all queries - sentence embedding by averaging tfidf weighted word vectors
top2_tfidf = []
top5_tfidf = []

for query_index in range(len(queries)):
    actual_response_id = queries.loc[query_index,'qid2']
    top5_ranked = get_matched_responses(query_index,responses_tfidf,queries_tfidf,5,False)     
    if actual_response_id in top5_ranked[:2]:
        top2_tfidf.append(1.0)
        top5_tfidf.append(1.0)
    elif actual_response_id in top5_ranked:
        top2_tfidf.append(0.0)
        top5_tfidf.append(1.0)    
    else:
        top2_tfidf.append(0.0)
        top5_tfidf.append(0.0)

  cosine_similarity = np.dot(vector1,vector2)/(np.linalg.norm(vector1,ord=2)*np.linalg.norm(vector2,ord=2))


In [136]:
top2_tfidf_acc = accuracy_score(queries['is_duplicate'], top2_tfidf)
top5_tfidf_acc = (accuracy_score(queries['is_duplicate'],top5_tfidf))

print('Accuracy for matched responses in top 2, with averaged tfidf weighted word vectors for sentence embedding: {} %'. format(top2_tfidf_acc *100))
print('Accuracy for matched responses in top 5, with averaged tfidf weighted word vectors for sentence embedding: {} %'. format(top5_tfidf_acc *100))



Accuracy for matched responses in top 2, with averaged tfidf weighted word vectors for sentence embedding: 41.0 %
Accuracy for matched responses in top 5, with averaged tfidf weighted word vectors for sentence embedding: 52.0 %
