# In [2]:
import csv
from post_parser_record import PostParserRecord
from scipy import spatial


def read_tsv_test_data(file_path):
    """Load the duplicate-question test set from a tab-separated file.

    Every non-blank row holds a question id in the first column followed by
    the ids of the questions that are similar to it.

    Args:
        file_path: Path of the TSV file to read.

    Returns:
        A ``(dic_similar_questions, lst_all_test)`` tuple where
        ``dic_similar_questions`` maps each first-column question id to the
        list of ids on the rest of its row, and ``lst_all_test`` is a flat
        list with every id seen in the file, in file order (ids that occur
        several times in the file occur several times in the list).

    Raises:
        ValueError: If a non-empty cell is not an integer, or the first cell
            of a non-blank row is empty.
    """
    dic_similar_questions = {}
    lst_all_test = []
    # The csv module asks for newline="" so that it can do its own
    # newline handling.
    with open(file_path, newline="") as fd:
        reader = csv.reader(fd, delimiter="\t", quotechar='"')
        for row in reader:
            cells = [cell.strip() for cell in row]
            # Blank (or whitespace-only) lines carry no data; skip them
            # instead of failing on row[0].
            if not any(cells):
                continue
            question_id = int(cells[0])
            # A trailing tab produces an empty cell; ignore those rather
            # than failing in int('').
            lst_similar = [int(cell) for cell in cells[1:] if cell]
            dic_similar_questions[question_id] = lst_similar
            lst_all_test.append(question_id)
            lst_all_test.extend(lst_similar)
    return dic_similar_questions, lst_all_test
# read_tsv_test_data("duplicate_questions.tsv")



from gensim.models import FastText

def get_sentence_embedding(model, sentence):
    """Return the embedding of *sentence* as the mean of its word vectors.

    The sentence is tokenized by splitting on single spaces and each token
    is looked up through ``model.wv[token]``.

    Args:
        model: Trained model exposing word vectors through ``model.wv``.
        sentence: The input sentence as a string.

    Returns:
        The element-wise average of the token vectors.
    """
    words = sentence.split(" ")
    vectors = [model.wv[word] for word in words]
    total = vectors[0]
    for vector in vectors[1:]:
        # Deliberately ``total = total + vector`` and not ``+=``: the lookup
        # may hand back the model's own (possibly read-only) array, and an
        # in-place add would overwrite the stored vector of the first word.
        total = total + vector
    return total / len(words)

def train_model(lst_sentences):
    """Train a FastText model on the given sentences and return it.

    Args:
        lst_sentences: Iterable of sentences. Each sentence may be a plain
            string (it is then split on whitespace) or an already tokenized
            sequence of words.

    Returns:
        The trained ``FastText`` model (window=5, min_n=1, 10 epochs).
    """
    # gensim expects every sentence to be a list of tokens. A raw string
    # would be iterated character by character, so tokenize strings here.
    corpus = []
    for sentence in lst_sentences:
        if isinstance(sentence, str):
            tokens = sentence.split()
        else:
            tokens = list(sentence)
        # Sentences without any token contribute nothing to training.
        if tokens:
            corpus.append(tokens)

    model = FastText(window=5, min_n=1)
    model.build_vocab(corpus)
    model.train(corpus, total_examples=len(corpus), epochs=10)
    return model

def _split_into_sentences(text):
    """Split *text* on '.' and return the stripped, non-empty pieces."""
    pieces = (piece.strip() for piece in text.split('.'))
    return [piece for piece in pieces if piece]


def main():
    """Train a FastText model on the posts and match each test question.

    Reads the duplicate-question test file and the posts file, trains the
    model on the titles, bodies and answer bodies of every question whose id
    does not appear in the test file, and then, for every test question,
    looks for the other question whose title embedding has the highest
    cosine similarity to the test question's title embedding.

    Returns:
        A dictionary with the test question id as the key and the most
        similar question id as the value (-1 when no other question has a
        similarity above 0).
    """
    duplicate_file = "duplicate_questions.tsv"
    post_file = "Posts_law.xml"
    dic_similar_questions, lst_all_test = read_tsv_test_data(duplicate_file)
    post_reader = PostParserRecord(post_file)

    # Membership is checked once per question; a set keeps that O(1).
    test_ids = set(lst_all_test)

    lst_training_sentences = []
    for question_id in post_reader.map_questions:
        # Questions that are part of the test set are kept out of training.
        if question_id in test_ids:
            continue
        question = post_reader.map_questions[question_id]
        # Collect sentences here
        lst_training_sentences.extend(_split_into_sentences(question.title))
        lst_training_sentences.extend(_split_into_sentences(question.body))
        lst_answers = question.answers
        if lst_answers is not None:
            for answer in lst_answers:
                # Collect sentences here
                lst_training_sentences.extend(
                    _split_into_sentences(answer.body))

    # train your model
    model = train_model(lst_training_sentences)

    # Every title is compared against every test title, so embed each title
    # once up front instead of once per comparison.
    title_embeddings = {}
    for question_id in post_reader.map_questions:
        title = post_reader.map_questions[question_id].title
        title_embeddings[question_id] = get_sentence_embedding(model, title)

    # This dictionary will have the test question id as the key
    # and the most similar question id as the value
    dictionary_result = {}

    # finding similar questions using the fastText model
    for test_question_id in dic_similar_questions:
        test_embedding = title_embeddings[test_question_id]
        max_similarity = 0
        most_similar_question_id = -1
        for question_id, embedding in title_embeddings.items():
            # we are not comparing a question with itself
            if question_id == test_question_id:
                continue
            # cosine similarity = 1 - cosine distance
            similarity_title = 1 - spatial.distance.cosine(test_embedding,
                                                           embedding)
            if similarity_title > max_similarity:
                max_similarity = similarity_title
                most_similar_question_id = question_id
        dictionary_result[test_question_id] = most_similar_question_id

    return dictionary_result


# Run the pipeline only when this file is executed directly (script or
# notebook cell), not when it is imported from another module.
if __name__ == "__main__":
    main()


# Captured output pasted after the code (presumably notebook cell output; not Python):
# [0 0 0 1 0 0 1 1 1 1 1 0 1 1 1 0 0 1 1 1 0 1 0 0 1 0 1 0 1 0 0 0 0 0 1 1 0
#  0 0 1 0 1 0 0 0 1 0 1 0 1 1 1 1 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 1 1 1 1 1 1
#  1 0 1 0 0 1 1 0 1 1 1 0 1 0 0 1 0 0 0 1 1 0 0 1 0 1 1 1 1 1 0 1 1 1 0 0 0
#  1 1 0 1 0 0 1 1 0 0 1 1 1 1 1 0 0 0 1 1 1 1 1 0 1 0 0 1 0 1 0 0 0 0 1 1 1
#  1 0 1 1 1 1 0 1 0 0 1 1 1 1 0 0 0 1 0 1 0 1 0 1 1 1 0 0 0 1 0 1 0 1 1 0 1
#  1 0 1 1 0 0 0 1 1 1 1 0 1 0 0 1 0 1 0 0 0 0 1 1 1 0 1 1 0 1 0 0 0 1 0 0 1
#  1 1 0 1 1 0 1 0 0 1 1 0 1 1 1 1 0 0 0 0 1 0 0 0 1 0 1 1 0 1 0 0 1 1 0 0 0
#  0 0 0 0 1 1 0 1 1 1 0 1 0 1 1 1 0 1 0 1 0 0 0]
