In [1]:
#Given a new question, generate recommendations for that given question

#Step 1: Find the word embedding of the question
#We expect the question to have both title and body
#we expect them to receive ques_title and ques_body

import io
import pickle
import numpy as np

def load_vectors(fname):
    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    n, d = map(int, fin.readline().split())
    data = {}
    for line in fin:
        tokens = line.rstrip().split(' ')
        data[tokens[0]] = np.array(list(map(float, tokens[1:])))
    return data

fasttext = "crawl-300d-2M-subword/crawl-300d-2M-subword.vec"
ft_model = load_vectors(fasttext)

def words_to_vec(words):
    # fasttext vector dimension is 300
    vec = np.zeros(300)
    for word in words:
        if word not in ft_model:
            continue
        else:
            vec += ft_model.get(word)
    vec /= len(words)
    return vec

In [37]:
ques_title = "Do seals eat shrimp?"
ques_body = "Seals are a type of semi-aquatic mammal belonging to the suborder Pinnipedia, which also includes walruses. They spend time on land and breathe air, but move much more easily when in the water. There are over 30 species of seals in the world."





In [38]:

title_embedding = words_to_vec(ques_title.split())
body_embedding = words_to_vec(ques_body.split())

ques_embedding = title_embedding + body_embedding

In [39]:
#Step 2: Find the cluster to which this question belongs to
#based on the distance of question embeddings from the cluster centers

cluster_centers = pickle.load(open("clustering/bio_cluster_centers.dat","rb"))
from sklearn.metrics.pairwise import euclidean_distances

distances = euclidean_distances(cluster_centers,[ques_embedding])
given_ques_cluster = distances.argsort(axis=0)[0][0]

In [11]:
given_ques_cluster[0]

72

In [34]:
import pickle
import pandas as pd
nearby_centers_dict = pickle.load(open("models/bio_nearby_centers_dict.dat","rb"))
questions_in_a_cluster_dict = pickle.load(open("models/bio_questions_in_a_cluster_dict.dat","rb"))
bio_ques_dict = pickle.load(open("clustering/bio_ques_dict.dat","rb"))
input_df = pd.read_csv("csv_files/bio_questions.csv")

In [40]:
#generate recommendations for that question
#find the nearest 5 clusters to the cluster of the question
#accumulate all the questions, run a cosine similarity and output the top 10
from sklearn.metrics.pairwise import cosine_similarity

total_questions = []
recommended_questions = []
cluster_label = given_ques_cluster
nearby_clusters = nearby_centers_dict[cluster_label]
#nearby_clusters is a list
for cluster in nearby_clusters:
    total_questions.extend(questions_in_a_cluster_dict[cluster])
    #total_questions is a list of all questions
    #for a given question find cosine similarity with all these questions
    cosine_scores = []
    for ques in total_questions:
        cosine_embedding = cosine_similarity(ques_embedding.reshape(1,-1),bio_ques_dict[ques][0].reshape(1,-1))
        cosine_scores.append((ques,(cosine_embedding)[0][0]))
    cosine_scores.sort(key=lambda x: x[1],reverse=True)
    for tup in cosine_scores[0:11]:
        #get the ques_id and its title from the dataframe
        #print(tup[0])
        recommended_questions.append(input_df.loc[input_df['Id'] == tup[0]]['Title'])
        

In [41]:
pd.set_option('display.max_colwidth',1000)
recommended_questions

[17631    Do Tardigrades preserve water replace water
 Name: Title, dtype: object, 17377    Do microorganisms contain water
 Name: Title, dtype: object, 9586    Do saltwater fish need drink
 Name: Title, dtype: object, 13222    Do macrophages engulf material epidermis
 Name: Title, dtype: object, 11117    Do fish hold urine deep water
 Name: Title, dtype: object, 18446    Do mosquitos excrete blood
 Name: Title, dtype: object, 6908    Do things contain amylase
 Name: Title, dtype: object, 19686    Do mosquitos vary toxicity
 Name: Title, dtype: object, 4179    Do organisms recycle waste internally
 Name: Title, dtype: object, 17043    Do oranges traces starch
 Name: Title, dtype: object, 9686    Do mitochondria digest fats
 Name: Title, dtype: object, 17631    Do Tardigrades preserve water replace water
 Name: Title, dtype: object, 17377    Do microorganisms contain water
 Name: Title, dtype: object, 15811    Do plants need sleep
 Name: Title, dtype: object, 9586    Do saltwater fish n