In [0]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import tqdm as tqdm
from collections import defaultdict
import pickle as pickle
import re

def get_tfidf(column):
    """Build a TF-IDF document-term matrix from a column of text.

    Parameters
    ----------
    column : object exposing ``.values`` (e.g. a pandas Series)
        One document per entry; every entry is cast to ``str`` before
        vectorisation.

    Returns
    -------
    The matrix returned by ``TfidfVectorizer.fit_transform`` with English
    stop words removed. The fitted vectorizer itself is not returned.
    """
    documents = column.values.astype(str)
    vectorizer = TfidfVectorizer(stop_words="english")
    return vectorizer.fit_transform(documents)

def find_similar(tfidf_matrix, index, top_n = 20):
    """Return the ``top_n`` rows of ``tfidf_matrix`` most similar to row ``index``.

    Similarity is the ``linear_kernel`` (dot product) between the query row
    and every row of the matrix.

    Parameters
    ----------
    tfidf_matrix : matrix supporting row slicing
        One row per document.
    index : int
        Position of the query row; it is never included in the result.
    top_n : int, default 20
        Maximum number of neighbours to return (expected to be non-negative).

    Returns
    -------
    list of (row_position, similarity) tuples, best match first.
    """
    cosine_similarities = linear_kernel(tfidf_matrix[index:index+1], tfidf_matrix).flatten()
    # Only the best top_n + 1 candidates can matter: at most one of them is the
    # query row itself, which is dropped below. Trimming here avoids walking
    # every document in Python for each query.
    candidates = cosine_similarities.argsort()[::-1][:top_n + 1]
    related_docs_indices = [i for i in candidates if i != index][:top_n]
    return [(i, cosine_similarities[i]) for i in related_docs_indices]

def save_obj(obj, name):
    """Pickle ``obj`` into the file ``name + '.pkl'``.

    The file is opened in binary write mode (an existing file is overwritten)
    and the highest available pickle protocol is used.
    """
    destination = name + '.pkl'
    with open(destination, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Read back the object pickled in ``name + '.pkl'`` and return it."""
    source = name + '.pkl'
    with open(source, 'rb') as handle:
        # Unpickling can run arbitrary code; only load files you trust.
        restored = pickle.load(handle)
    return restored

def processTags(temp):
    """Turn a tag string into space-separated words.

    Every ``<`` and ``>`` becomes a space, surrounding whitespace is stripped,
    and each pair of adjacent spaces is collapsed to one in a single pass.
    """
    angle_to_space = {ord("<"): " ", ord(">"): " "}
    spaced = temp.translate(angle_to_space)
    return spaced.strip().replace("  ", " ")
      
# Load the questions, clean the Tags column with processTags, and build one
# text document per question from body, title and tags.
df_ques = pd.read_csv("ds_questions.csv")
df_ques['Tags'] = df_ques['Tags'].apply(lambda raw_tags: processTags(str(raw_tags)))
df_question_merged = (
    df_ques['Body'].astype(str)
    + " " + df_ques['Title'].astype(str)
    + " " + df_ques['Tags'].astype(str)
)


In [2]:
# Documents to vectorise: one merged body/title/tags string per question.
# fillna returns a new Series, so its result must be kept to have any effect.
saved_column = df_question_merged.fillna(" ")
tfidf_bio_ques = get_tfidf(saved_column)

saved_column.head()

0    In prokaryotic translation critical efficient ...
1    Does anyone suggestions prevent RNAse contamin...
2    Tortora writes Principles Anatomy Physiology L...
3    Various people lab prepare liter LB add kanamy...
4    Are cases splicing machinery constructs mRNA e...
dtype: object

In [3]:
# Map each question Id to the (similar question Id, score) pairs returned by
# find_similar. Every write goes to ds_ques_rec_dict, the dictionary created
# here, so the cell does not rely on a name left over from another session.
ds_ques_rec_dict = defaultdict(list)
# Row positions in the TF-IDF matrix are translated to question Ids by position.
question_ids = df_ques['Id'].to_numpy()
for i in tqdm.tqdm(range(saved_column.shape[0])):
    question_id = question_ids[i]
    for index, score in find_similar(tfidf_bio_ques, i):
        # defaultdict(list) creates the list on first access; no membership test needed.
        ds_ques_rec_dict[question_id].append((question_ids[index], score))

100%|██████████| 21594/21594 [19:53<00:00, 18.10it/s]


In [0]:
# Persist the question-based recommendations as "tf_idf_ds_ques_rec.pkl".
save_obj(obj=ds_ques_rec_dict, name="tf_idf_ds_ques_rec")

In [0]:
answers_file = "ds_answers.csv"
df = pd.read_csv(answers_file)

# fillna returns a new Series; keep it so that missing bodies become blank
# text instead of being cast to the string "nan" inside get_tfidf.
saved_column = df.Body.fillna(" ")
tfidf_bio_answers = get_tfidf(saved_column)

saved_column.head()

0    You need careful everything comes contact samp...
1    Here tips I routinely wipe surfaces including ...
2    Did try centrifuge tube got push liquid bottom...
3    A quite safe way shipping plasmids put filter ...
4    Lundholt et al describe simple trick let plate...
Name: Body, dtype: object

In [0]:
# Preview the first five answer rows.
df.head(5)

Unnamed: 0,Id,ParentId,Score,Body,CommentCount
0,12,2,14,You need careful everything comes contact samp...,0
1,14,2,13,Here tips I routinely wipe surfaces including ...,0
2,17,9,13,Did try centrifuge tube got push liquid bottom...,0
3,18,9,18,A quite safe way shipping plasmids put filter ...,3
4,20,13,13,Lundholt et al describe simple trick let plate...,0


In [0]:
# Map each answer's ParentId to the (ParentId of a similar answer, score) pairs
# returned by find_similar. Every write goes to ds_answers_rec_dict, the
# dictionary created here, so the cell does not rely on a leftover name.
ds_answers_rec_dict = defaultdict(list)
# Row positions in the TF-IDF matrix are translated to ParentIds by position.
parent_ids = df['ParentId'].to_numpy()
for i in tqdm.tqdm(range(saved_column.shape[0])):
    parent_id = parent_ids[i]
    for index, score in find_similar(tfidf_bio_answers, i):
        # defaultdict(list) creates the list on first access; no membership test needed.
        ds_answers_rec_dict[parent_id].append((parent_ids[index], score))

In [0]:
# Persist the answer-based recommendations as "tf_idf_ds_answers_rec.pkl".
save_obj(obj=ds_answers_rec_dict, name="tf_idf_ds_answers_rec")

In [0]:
import pandas as pd
# Post-link table; later cells read its PostId, RelatedPostId and Id columns.
postlinksDF = pd.read_csv(filepath_or_buffer='ds_postlinks.csv')

In [7]:
import tqdm 
# Final list of dicts, one per distinct PostId, in first-seen order. Each dict
# holds the post id, its RelatedPostId values and the Id of each link row
# (stored under the existing 'answerID' key).
postLinks = []
# PostIds already encountered, in first-seen order.
posts = []
# PostId -> its dict in postLinks, so each row is a single lookup instead of
# a list membership test followed by a rescan of every entry.
entry_by_post = {}

for _, row in tqdm.tqdm(postlinksDF.iterrows()):
    post_id = row['PostId']
    entry = entry_by_post.get(post_id)
    if entry is None:
        entry = {'postID': post_id, 'relatedID': [], 'answerID': []}
        entry_by_post[post_id] = entry
        posts.append(post_id)
        postLinks.append(entry)
    entry['relatedID'].append(row["RelatedPostId"])
    entry['answerID'].append(row["Id"])

1307it [00:01, 1038.30it/s]


In [0]:
import pickle as pickle

def load_obj(name):
    """Read back the object pickled in ``name + '.pkl'`` and return it.

    Note: this repeats the ``load_obj`` helper already defined near the top
    of the notebook.
    """
    source = name + '.pkl'
    with open(source, 'rb') as handle:
        # Unpickling can run arbitrary code; only load files you trust.
        restored = pickle.load(handle)
    return restored

# Reload the two recommendation dictionaries from their pickle files.
tf_idf_ds_ques_rec, tf_idf_ds_answers_rec = (
    load_obj(stem) for stem in ("tf_idf_ds_ques_rec", "tf_idf_ds_answers_rec")
)

In [0]:
def evaluate(recommended):
    """Score a recommendation mapping against the global ``postLinks`` list.

    Parameters
    ----------
    recommended : mapping
        Post id -> ordered sequence of items whose element ``0`` is a
        recommended post id.

    Returns
    -------
    float
        Sum of per-post scores divided by ``len(postLinks)``. A post
        contributes ``0.3 * hits / expected + 0.7 * late_hits / hits`` when at
        least one recommendation appears in its ``relatedID`` list, and
        nothing otherwise.

    Raises
    ------
    ZeroDivisionError
        If ``postLinks`` is empty.
    """
    total = 0
    for link in postLinks:
        post_id = link['postID']
        if post_id not in recommended:
            continue
        related = link['relatedID']
        expected = len(related)
        half = int(expected / 2)
        hits = 0
        late_hits = 0
        for rank, rec in enumerate(recommended[post_id], start=1):
            if rec[0] in related:
                hits += 1
                # NOTE(review): this counts hits at rank >= half of the
                # related count. If the intent was to reward hits near the
                # top of the list, the comparison may be inverted — confirm.
                if rank >= half:
                    late_hits += 1
        if hits:
            total += (0.3* (hits/expected)+ 0.7*(late_hits/hits))
    return (total/len(postLinks))

In [13]:
# Question-based recommendations, score as a percentage.
100 * evaluate(tf_idf_ds_ques_rec)

32.48457411133468

In [14]:
# Answer-based recommendations, score as a percentage.
100 * evaluate(tf_idf_ds_answers_rec)

38.21673373574787

In [0]:
def combine(ans_recommendations,ques_recommendations):
    """Merge two recommendation mappings into a new dictionary.

    Parameters
    ----------
    ans_recommendations, ques_recommendations : mapping
        Key -> list of recommendations.

    Returns
    -------
    dict
        For every key in either input, a new list holding the answer-based
        recommendations followed by the question-based ones. Keys keep the
        order: all answer keys first, then question-only keys.

    Neither input is modified: each output list is built fresh, so extending
    it never touches a list owned by the caller.
    """
    ques_ans_recommendations = {}
    for source in (ans_recommendations, ques_recommendations):
        for key, recs in source.items():
            # setdefault creates a fresh list instead of aliasing the input's.
            ques_ans_recommendations.setdefault(key, []).extend(recs)
    return ques_ans_recommendations

In [0]:
# Merge answer-based and question-based recommendations per key.
ds_qa_rec = combine(
    ans_recommendations=tf_idf_ds_answers_rec,
    ques_recommendations=tf_idf_ds_ques_rec,
)

In [0]:
# Persist the combined recommendations as "ds_qa_rec.pkl".
save_obj(obj=ds_qa_rec, name="ds_qa_rec")

In [18]:
# Combined recommendations, score as a percentage.
100 * evaluate(ds_qa_rec)

61.56515761234071