Example of training a fastText model and computing sentence embeddings

In [1]:
from gensim.models import FastText
from scipy import spatial
import re
from nltk.tokenize import sent_tokenize
import numpy as np

In [2]:
def get_sentence_embedding(model, sentence):
    """Return the embedding of a sentence as the average of its word vectors.

    The sentence is tokenized on any run of whitespace, so newlines, tabs and
    repeated spaces neither stay attached to words nor produce empty tokens.

    model - trained model; ``model.wv[word]`` must return the vector of a word
    sentence - string to embed

    Returns the element-wise mean of the word vectors. A sentence that holds
    no tokens yields a zero vector of length ``model.wv.vector_size``.
    """
    words = sentence.split()
    if not words:
        # Nothing to average; return a neutral vector instead of dividing by zero.
        return np.zeros(model.wv.vector_size, dtype=np.float32)
    return np.mean([model.wv[word] for word in words], axis=0)

Reading Law Stack Exchange Data

In [3]:
import csv
from post_parser_record import PostParserRecord


def read_tsv_test_data(file_path):
    """Read the tab-separated test file of similar questions.

    Every non-empty row holds a question id followed by the ids of the
    questions similar to it. Blank rows and empty cells (for example the one
    produced by a trailing tab) are ignored instead of raising.

    file_path - path of the tsv file

    Returns a tuple (dic_similar_questions, lst_all_test):
      dic_similar_questions - question id -> list of similar question ids
      lst_all_test - every id found in the file: each question id followed
                     by its similar ids, in file order
    """
    dic_similar_questions = {}
    lst_all_test = []
    # newline="" leaves newline handling to the csv module, as its docs advise.
    with open(file_path, newline="") as fd:
        for row in csv.reader(fd, delimiter="\t", quotechar='"'):
            ids = [int(cell) for cell in row if cell.strip()]
            if not ids:
                # blank line: nothing to record
                continue
            question_id, lst_similar = ids[0], ids[1:]
            dic_similar_questions[question_id] = lst_similar
            lst_all_test.append(question_id)
            lst_all_test.extend(lst_similar)
    return dic_similar_questions, lst_all_test

def train_model(lst_sentences):
    """Initialize, tune and train a FastText model.

    lst_sentences - training examples; each item is either a string (split
                    on whitespace here) or an already tokenized list of words

    Returns the trained model.
    """
    # gensim expects every training example to be a list of tokens. A plain
    # string would be iterated character by character, which would make the
    # vocabulary consist of single characters instead of words.
    corpus = [
        sentence.split() if isinstance(sentence, str) else list(sentence)
        for sentence in lst_sentences
    ]

    model = FastText(vector_size=100, window=5, negative=10, min_n=1)
    model.build_vocab(corpus)

    # training the model
    model.train(corpus, total_examples=len(corpus), epochs=20)

    return model

In [4]:
def train_and_save_model(post_file, lst_all_test, post_reader):
    """Build the training text of every non-test question, train and save.

    For each question that is not listed in lst_all_test, the title, the body
    and the bodies of all its answers are joined with single spaces (so the
    words at the boundaries do not fuse) and html tags are dropped. The model
    is trained on these texts and saved to "fastText.model".

    post_file - name of the xml file; not used here, kept so existing calls
                keep working
    lst_all_test - ids of the questions that must be kept out of training
    post_reader - object exposing map_questions (question id -> question with
                  title, body and answers attributes)

    Returns the trained model.
    """
    # A set gives constant-time membership tests for every question.
    held_out_ids = set(lst_all_test)

    lst_training_sentences = []
    for question_id, question in post_reader.map_questions.items():
        # collect data only from questions that are not in the tsv file
        if question_id in held_out_ids:
            continue

        parts = [question.title, question.body]
        # if there are answers to the question, add them as well
        if question.answers is not None:
            parts.extend(answer.body for answer in question.answers)

        text = " ".join(part for part in parts if part)
        # drop html tags once, also for questions that have no answers
        lst_training_sentences.append(re.sub(r"(?s)<.*?>", "", text))

    # train and save
    model = train_model(lst_training_sentences)
    model.save("fastText.model")
    return model

In [5]:
def get_average_post_embedding(model, text):
    """Embed a post as the mean of the embeddings of its sentences.

    model - FastText model handed on to get_sentence_embedding
    text - post string; it is split into sentences with sent_tokenize

    Returns the average over all sentence embeddings (taken along axis 0).
    """
    sentence_vectors = [
        get_sentence_embedding(model, chunk) for chunk in sent_tokenize(text)
    ]
    return np.mean(sentence_vectors, axis=0)

In [6]:
# Matches html tags (same pattern as used when preparing the training text).
_HTML_TAG_PATTERN = re.compile(r"(?s)<.*?>")


def _embed_question(model, question):
    """Return (title_vector, body_vector) of a question with html tags dropped."""
    title = _HTML_TAG_PATTERN.sub("", question.title)
    body = _HTML_TAG_PATTERN.sub("", question.body)
    return (
        get_average_post_embedding(model, title),
        get_average_post_embedding(model, body),
    )


def get_similar_questions(model, post_reader, dic_similar_questions):
    """Find the most similar question for every question of the test data.

    Each test question is compared with every other question in
    post_reader.map_questions by cosine similarity, once using the title
    embeddings and once using the body embeddings. The candidate with the
    highest similarity wins, also when all similarities are zero or negative.

    model - FastText model
    post_reader - object exposing map_questions (question id -> question
                  with title and body attributes)
    dic_similar_questions - dictionary whose keys are the test question ids

    Returns a tuple (title_dictionary_result, body_dictionary_result); both
    map a test question id to the id of its most similar question. The id is
    0 when no candidate produced a comparable similarity (for example when
    there is no other question).
    """
    title_dictionary_result = {}
    body_dictionary_result = {}
    if not dic_similar_questions:
        return title_dictionary_result, body_dictionary_result

    # The embedding of a question does not depend on the test question it is
    # compared with, so compute every embedding once instead of once per pair.
    embeddings = {
        question_id: _embed_question(model, question)
        for question_id, question in post_reader.map_questions.items()
    }

    for test_question_id in dic_similar_questions:
        test_title_vec, test_body_vec = embeddings[test_question_id]

        # Start below any real cosine similarity so that the best candidate is
        # kept even when every similarity is <= 0.
        best_title_similarity = float("-inf")
        best_body_similarity = float("-inf")
        id_title = 0
        id_body = 0

        for question_id, (title_vec, body_vec) in embeddings.items():
            # we are not comparing a question with itself
            if question_id == test_question_id:
                continue

            # cosine similarity = 1 - cosine distance
            result_title = 1 - spatial.distance.cosine(test_title_vec, title_vec)
            if result_title > best_title_similarity:
                best_title_similarity = result_title
                id_title = question_id

            result_body = 1 - spatial.distance.cosine(test_body_vec, body_vec)
            if result_body > best_body_similarity:
                best_body_similarity = result_body
                id_body = question_id

        title_dictionary_result[test_question_id] = id_title
        body_dictionary_result[test_question_id] = id_body

    return title_dictionary_result, body_dictionary_result

In [None]:
def _precision_at_1(dic_result, dic_similar_questions):
    """Return the share of test questions whose predicted id is a known similar one."""
    if not dic_result:
        # no test questions: report 0 instead of dividing by zero
        return 0.0
    hits = sum(
        1
        for key, predicted_id in dic_result.items()
        if predicted_id in dic_similar_questions[key]
    )
    return hits / len(dic_result)


def main():
    """Train the model on the non-test posts and print P@1 for title and body.

    Reads the duplicate questions from "duplicate_questions.tsv" and the posts
    from "Posts_law.xml", trains and saves the model, looks up the most
    similar question of every test question and prints the average
    precision at 1, once based on titles and once based on bodies.
    """
    duplicate_file = "duplicate_questions.tsv"
    post_file = "Posts_law.xml"
    post_reader = PostParserRecord(post_file)
    # load data
    dic_similar_questions, lst_all_test = read_tsv_test_data(duplicate_file)

    # train and save model
    model = train_and_save_model(post_file, lst_all_test, post_reader)
    # To reuse a previously saved model instead of training, replace the line
    # above with:
    #model = FastText.load("fastText.model")

    # These dictionaries have the test question id as the key
    # and the most similar question id as the value
    title_dictionary_result, body_dictionary_result = get_similar_questions(
        model, post_reader, dic_similar_questions
    )

    ave = _precision_at_1(title_dictionary_result, dic_similar_questions)
    print('Average P@1 for title ', ave)

    ave = _precision_at_1(body_dictionary_result, dic_similar_questions)
    print('Average P@1 for body ', ave)
        

# Run the whole pipeline: load the data, train and save the model, then print
# the P@1 scores for titles and bodies.
main()