In [129]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer


In [130]:
unprocessed_df = pd.read_pickle('../data/unprocessed_dataframe')

In [131]:
unprocessed_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,133273,213221,213222.0,How is the life of a math student? Could you d...,Which level of prepration is enough for the ex...,0.0
1,402555,536040,536041.0,How do I control my horny emotions?,How do you control your horniness?,1.0
2,360472,364011,490273.0,What causes stool color to change to yellow?,What can cause stool to come out as little balls?,0.0
3,150662,155721,7256.0,What can one do after MBBS?,What do i do after my MBBS ?,1.0
4,183004,279958,279959.0,Where can I find a power outlet for my laptop ...,"Would a second airport in Sydney, Australia be...",0.0


In [132]:
embeddings_dict = {}
with open('../data/glove.6B.50d.txt', 'r') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings_dict[word] = vector

In [133]:
#example vector embedding
v = embeddings_dict['king']
v.shape

(50,)

In [134]:
#lower casing
def lower_case(text):
    return text.lower()


#remove numbers
def remove_numbers(text):
    output = re.sub(r'\d+', '', text)
    return output

# remove punctuation
import string
def remove_punctuation(text):
    text_p = "".join([char for char in text if char not in string.punctuation])
    return text_p


#tokenize text
nltk.download('punkt')
from nltk import word_tokenize

def tokenize(text):
    words = word_tokenize(text)
    return words


#remove stopwords
nltk.download('stopwords')
stop_words = stopwords.words('english')

def remove_stopwords(text):
    
    text_p = " ".join([word for word in text.split() if word not in stop_words])
    
    return text_p

from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

lemmatizer = WordNetLemmatizer()

def lemmatize(text):
    lemmatized = [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in text.split()]
    text_p = " ".join(lemmatized)
    return text_p




[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [135]:
class Preprocessor():
    def __init__(self):
        pass
    
    def preprocess(self,df_column, steps):
        self.df_column = df_column
        self.steps = steps
    
        if 'lower_case' in self.steps:
            self.df_column = self.df_column.apply(lambda x: lower_case(x))
        
        if 'remove_numbers' in self.steps:
            self.df_column = self.df_column.apply(lambda x: remove_numbers(x))
        
        if 'remove_punctuation' in self.steps:
            self.df_column = self.df_column.apply(lambda x: remove_punctuation(x))           
        
        if 'tokenize' in self.steps:
            self.df_column = self.df_column.apply(lambda x: tokenize(x))
        
        if 'stopwords' in self.steps:
            self.df_column = self.df_column.apply(lambda x: remove_stopwords(x))
            
        if 'single_characters' in self.steps:
            self.df_column = self.df_column.apply(lambda x: remove_single_characters(x))
        
        if 'stemming' in self.steps:
            self.df_column = self.df_column.apply(lambda x: stem_words(x))
            
        if 'lemmatize' in self.steps:
            self.df_column = self.df_column.apply(lambda x: lemmatize(x))    
            
        return self.df_column

In [136]:
df = unprocessed_df.copy()
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,133273,213221,213222.0,How is the life of a math student? Could you d...,Which level of prepration is enough for the ex...,0.0
1,402555,536040,536041.0,How do I control my horny emotions?,How do you control your horniness?,1.0
2,360472,364011,490273.0,What causes stool color to change to yellow?,What can cause stool to come out as little balls?,0.0
3,150662,155721,7256.0,What can one do after MBBS?,What do i do after my MBBS ?,1.0
4,183004,279958,279959.0,Where can I find a power outlet for my laptop ...,"Would a second airport in Sydney, Australia be...",0.0


In [138]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 363177 entries, 0 to 363176
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   id            363177 non-null  object 
 1   qid1          363177 non-null  object 
 2   qid2          363177 non-null  float64
 3   question1     363177 non-null  object 
 4   question2     363177 non-null  object 
 5   is_duplicate  363177 non-null  float64
dtypes: float64(2), object(4)
memory usage: 16.6+ MB


In [137]:
steps = ['lower_case','remove_numbers','remove_punctuation',
        'tokenize','stopwords']
processor = Preprocessor()

processor = Preprocessor()
df['question1'] = processor.preprocess(df['question1'] ,steps)
df['question2'] = processor.preprocess(df['question2'] ,steps)

AttributeError: 'list' object has no attribute 'split'

In [None]:
df.head(2)

In [None]:
#create dataframe of top 100 queries that have label = 1
queries = df[df['is_duplicate'] == 1][:100]
queries = queries[['qid1','question1','qid2','question2', 'is_duplicate']]
queries = queries.reset_index(drop=True)
queries.head(2)

In [None]:
class Mean_Vectorizer():
    def __init__(self):
        pass
    
    def fit(self,X):
        return self
        
    def transform(self,embeddings,data):
        self.embeddings = embeddings
        self.data = data
        #get dimensionality of word embeddings
        v = self.embeddings['king']
        self.D = v.shape[0]
        
        X = np.zeros(shape = (len(data),self.D))
        
        for i in range(len(data)):
            words = data[i]
            vectors = []
            for word in words:
                try:
                    v = self.embeddings[word]
                    vectors.append(v)
                except KeyError:
                    pass
            vectors = np.array(vectors)
            if len(vectors) >0:
                sentence_embedding = np.mean(vectors, axis=0).reshape(1,-1)
                X[i,:] = sentence_embedding
            
        return X
    


In [None]:
class TFIDF_Vectorizer():
    
    def __init__(self):
        pass
    
    def fit(self,X):
        tf = TfidfVectorizer(analyzer= lambda x:x, preprocessor = lambda x: x, lower_case=False, norm = 'l2')
        tf.fit(X)
        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of 
        # known idf's
        max_idf = max(tf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

        
query_questions = list(queries['question1'])



train_questions = list(corpus['question2'])
query_questions = list(queries['question1'])


def dummy(text):
    return text
        
    
        

In [None]:
corpus = df[['qid2','question2']]
corpus = corpus.drop_duplicates('qid2').reset_index(drop=True)
len(corpus)

In [None]:
mv = Mean_Vectorizer()
X_train_mean = mv.transform(embeddings_dict,corpus['question2'])
X_test_mean = mv.transform(embeddings_dict,queries['question1'])

In [None]:
#calculate cosine similarity between each query and all responses. 
#Rank responses in terms of cosine similarity

def cos_sim(vector1,vector2):
    cosine_similarity = np.dot(vector1,vector2)/(np.linalg.norm(vector1,ord=2)*np.linalg.norm(vector2,ord=2))
    return cosine_similarity

def retreive_unprocessed_question(question_qid, question_type):
    if question_type == 'query':
        unprocessed_question = unprocessed_df.loc[unprocessed_df['qid1']==question_qid, 'question1'].iloc[0]
    else:
        unprocessed_question = unprocessed_df.loc[unprocessed_df['qid2']==question_qid, 'question2'].iloc[0]
    
    return unprocessed_question


def get_matched_responses(query_index, X_train,X_test,number_of_responses, print_results = False):
    
    query_qid = queries.loc[query_index,'qid1']
    actual_response_id = queries.loc[query_index,'qid2']
    
    similarities = []
    for r in range(len(X_train)):
        cosine_sim = cos_sim(X_test[query_index,:],X_train[r,:])
        similarities.append(cosine_sim)
    similarities = np.array(similarities)
    similarities = np.nan_to_num(similarities,nan=-9999) #replace nans with large negative number
    sorted_sim = similarities.argsort()[::-1][:number_of_responses]
    
    matched_responses = []
    for corpus_id in sorted_sim:
        response_qid2 = corpus.loc[corpus_id,'qid2']
        matched_responses.append(response_qid2)
            
        
    if print_results:
        print('Query')
        print(retreive_unprocessed_question(query_qid,'query'))
        print()
        print('Ground truth (actual) response')
        print(retreive_unprocessed_question(actual_response_id,'response'))
        print()
        print('Top {} matching responses'.format(number_of_responses))
        for response in matched_responses:
            print(retreive_unprocessed_question(response,'response'))
    else:        
        
        return matched_responses 



In [None]:
top2_mean = []
top5_mean = []

for query_index in range(len(queries)):
    actual_response_id = queries.loc[query_index,'qid2']
    top5_ranked = get_matched_responses(query_index,X_train_mean,X_test_mean,5,False)     
    if actual_response_id in top5_ranked[:2]:
        top2_mean.append(1.0)
        top5_mean.append(1.0)
    elif actual_response_id in top5_ranked:
        top2_mean.append(0.0)
        top5_mean.append(1.0)    
    else:
        top2_mean.append(0.0)
        top5_mean.append(0.0)
        
        

In [None]:
len(top2_mean)

In [None]:
from sklearn.metrics import accuracy_score
print(accuracy_score(queries['is_duplicate'],top2_mean))
print(accuracy_score(queries['is_duplicate'],top5_mean))