In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import tqdm as tqdm
from collections import defaultdict
import pickle as pickle
import re

def get_tfidf(column):
    """Fit a TF-IDF model on a column of text and return the transformed matrix.

    Parameters
    ----------
    column : object exposing ``.values`` (e.g. a pandas Series)
        One document per entry. Every entry is cast to ``str`` before fitting.

    Returns
    -------
    The result of ``TfidfVectorizer(stop_words="english").fit_transform`` on
    the stringified entries.
    """
    documents = column.values.astype(str)
    vectorizer = TfidfVectorizer(stop_words="english")
    return vectorizer.fit_transform(documents)

def find_similar(tfidf_matrix, index, top_n=20):
    """Rank every other row of ``tfidf_matrix`` by similarity to row ``index``.

    Similarity is the dot product computed by ``linear_kernel``.
    NOTE(review): this equals cosine similarity only if the rows are
    L2-normalised -- confirm against how the matrix was built.

    Parameters
    ----------
    tfidf_matrix : matrix supporting row slicing
        One row per document.
    index : int
        Row position of the query document; it is excluded from the result.
    top_n : int
        Upper bound of the slice applied to the ranked list.

    Returns
    -------
    list of ``(row_position, score)`` tuples, highest score first.
    """
    query_row = tfidf_matrix[index:index+1]
    scores = linear_kernel(query_row, tfidf_matrix).flatten()

    ranked = []
    # argsort is ascending, so walk it backwards to visit the best match first.
    for candidate in scores.argsort()[::-1]:
        if candidate == index:
            continue
        ranked.append((candidate, scores[candidate]))
    return ranked[0:top_n]

def save_obj(obj, name):
    """Pickle ``obj`` into the file ``name + '.pkl'``.

    The file is opened in binary write mode (an existing file is overwritten)
    and the object is dumped with ``pickle.HIGHEST_PROTOCOL``.
    """
    target_path = name + '.pkl'
    with open(target_path, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Unpickle and return the object stored in the file ``name + '.pkl'``.

    NOTE: unpickling can execute arbitrary code, so only load files that come
    from a trusted source.
    """
    source_path = name + '.pkl'
    with open(source_path, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded

def processTags(temp):
    """Turn a tag string such as ``"<python><pandas>"`` into ``"python pandas"``.

    Every ``<`` and ``>`` is replaced by a space, surrounding whitespace is
    stripped, and each run of two or more spaces is collapsed to a single
    space.

    Parameters
    ----------
    temp : str
        Raw tag string.

    Returns
    -------
    str
        The tags separated by single spaces.
    """
    without_brackets = re.sub(r"[<>]", " ", temp)
    # A single replace("  ", " ") pass leaves double spaces behind whenever
    # three or more spaces are adjacent (e.g. "<a> <b>"), so collapse whole runs.
    return re.sub(r" {2,}", " ", without_brackets.strip())
      
# Load the questions and build one text document per question: body + title + tags.
df_ques = pd.read_csv("DS_questions.csv")

# Missing values are blanked *before* any string conversion. Calling fillna after
# astype(str) (and discarding its result) has no effect, and leaves the literal
# token "nan" in the documents that are fed to TF-IDF.
df_ques['Tags'] = df_ques['Tags'].fillna("").apply(lambda x: processTags(str(x)))
df_question_merged = (
    df_ques.Body.fillna("").astype(str)
    + " " + df_ques.Title.fillna("").astype(str)
    + " " + df_ques.Tags.astype(str)
)
saved_column = df_question_merged #you can also use df['column_name']
tfidf_ds_ques = get_tfidf(saved_column)

# find_similar works with row positions, so translate positions to question Ids
# through a positional array rather than a label-based Series lookup.
question_ids = df_ques['Id'].values

# Question Id -> list of (similar question Id, similarity score), best match first.
# (Despite the variable name, both sides of the mapping come from the questions file.)
ds_answers_rec_dict = defaultdict(list)
for i in tqdm.tqdm(range(saved_column.shape[0])):
    for index, score in find_similar(tfidf_ds_ques, i):
        # defaultdict(list) creates the list on first access; no membership test needed.
        ds_answers_rec_dict[question_ids[i]].append((question_ids[index], score))

save_obj(ds_answers_rec_dict,"tf_idf_ds_ques_rec")

100%|██████████| 14481/14481 [13:20<00:00, 18.09it/s]


In [0]:
import pandas as pd
# Post-link table; the cells below read its PostId, RelatedPostId and Id columns.
postlinksDF = pd.read_csv(filepath_or_buffer='postlinks.csv')

In [23]:
import tqdm 
#final list of dict to keep track of related post ids and answer ids for each question id
# NOTE(review): 'answerID' is filled from the link table's "Id" column -- confirm
# that this is the intended identifier.
postLinks = []
#to keep track of questions that were already encountered
posts = []
# PostId -> its entry in postLinks, so a repeated post is found in O(1) instead of
# rescanning the whole list for every row.
postLinksByPost = {}

for index, row in tqdm.tqdm(postlinksDF.iterrows(), total=len(postlinksDF)):
    post_id = row['PostId']
    entry = postLinksByPost.get(post_id)
    if entry is None:
        # First time this post is seen: create its record, keeping first-seen order.
        entry = {'postID': post_id, 'relatedID': [], 'answerID': []}
        postLinksByPost[post_id] = entry
        posts.append(post_id)
        postLinks.append(entry)
    entry['relatedID'].append(row["RelatedPostId"])
    entry['answerID'].append(row["Id"])

5292it [00:32, 162.62it/s]


In [0]:
import pickle as pickle

def load_obj(name):
    """Unpickle and return the object stored in the file ``name + '.pkl'``.

    NOTE: unpickling can execute arbitrary code, so only load files that come
    from a trusted source.
    """
    source_path = name + '.pkl'
    with open(source_path, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded

# Previously pickled recommendation objects, read from "<name>.pkl".
# NOTE(review): no visible cell writes these "bio" files (the first cell writes
# "tf_idf_ds_ques_rec") -- confirm they exist before running this cell.
tf_idf_bio_ques_rec = load_obj(name="tf_idf_bio_ques_rec")
tf_idf_bio_answers_rec = load_obj(name="tf_idf_bio_answers_rec")

In [0]:
def evaluate(recommended):
    """Score a recommendation mapping against the module-level ``postLinks``.

    For each entry of ``postLinks`` whose ``'postID'`` is a key of
    ``recommended``:

    * ``count``   -- number of ids in the entry's ``'relatedID'`` list,
    * ``present`` -- number of recommended items whose first element is one of
      those related ids,
    * ``top``     -- number of those hits whose 1-based position in the
      recommendation list is ``>= int(count / 2)``.

    An entry with at least one hit contributes
    ``0.3 * present / count + 0.7 * top / present``; all other entries
    contribute nothing.

    NOTE(review): the name ``top`` suggests hits near the *start* of the list,
    yet the comparison rewards positions at or beyond ``count / 2``. The
    comparison is kept as-is so reported scores do not change -- confirm intent.

    Parameters
    ----------
    recommended : mapping
        post id -> sequence of items whose first element is a recommended id.

    Returns
    -------
    float
        Sum of contributions divided by ``len(postLinks)``; ``0.0`` when
        ``postLinks`` is empty (previously a ZeroDivisionError).
    """
    if not postLinks:
        return 0.0

    score = 0
    for link in postLinks:
        post_id = link['postID']
        if post_id not in recommended:
            continue

        count = len(link['relatedID'])
        # Set membership is O(1); the list is only needed for its length.
        related_ids = set(link['relatedID'])
        threshold = int(count / 2)

        present = 0
        top = 0
        for idx, rec in enumerate(recommended[post_id], start=1):
            if rec[0] in related_ids:
                present += 1
                if idx >= threshold:
                    top += 1

        if present:
            score += 0.3 * (present / count) + 0.7 * (top / present)
    return score / len(postLinks)

In [26]:
# Question-based recommendations, score shown as a percentage.
100 * evaluate(tf_idf_bio_ques_rec)

32.713693636158

In [27]:
# Answer-based recommendations, score shown as a percentage.
100 * evaluate(tf_idf_bio_answers_rec)

39.184702874350585

In [0]:
def combine(ans_recommendations,ques_recommendations):
    """Merge two recommendation mappings into a new dict.

    For every key, the result holds the items from ``ans_recommendations``
    followed by the items from ``ques_recommendations``. Keys present in only
    one mapping keep just that mapping's items.

    The result owns fresh lists. Storing the caller's lists directly and then
    extending them would silently grow ``ans_recommendations`` in place, so
    re-running a cell would change later results.

    Parameters
    ----------
    ans_recommendations, ques_recommendations : mapping
        key -> iterable of recommendation items.

    Returns
    -------
    dict
        key -> list of combined items; neither input is modified.
    """
    ques_ans_recommendations = {}
    for source in (ans_recommendations, ques_recommendations):
        for key, recs in source.items():
            ques_ans_recommendations.setdefault(key, []).extend(recs)
    return ques_ans_recommendations

In [0]:
# Per post id: answer-based recommendations first, then question-based ones.
bio_qa_rec = combine(
    ans_recommendations=tf_idf_bio_answers_rec,
    ques_recommendations=tf_idf_bio_ques_rec,
)

In [0]:
# Persist the combined recommendations to "bio_qa_rec.pkl".
save_obj(obj=bio_qa_rec, name="bio_qa_rec")

In [32]:
# Combined recommendations, score shown as a percentage.
100 * evaluate(bio_qa_rec)

59.87307907236617

In [18]:
# Inspect the combined recommendations stored under key 1.
# `qa_rec` is not defined by any visible cell (NameError on a fresh kernel);
# NOTE(review): assuming it was an earlier name for `bio_qa_rec` -- confirm.
bio_qa_rec[1]

[(6933, 0.36122968190390464),
 (166, 0.3212270470119691),
 (56685, 0.3068955474272252),
 (9991, 0.28159825774901304),
 (46416, 0.2237425413630964),
 (77849, 0.21352466800710193),
 (30029, 0.21184705417649854),
 (1629, 0.19020553948502805),
 (56523, 0.18663062118954066),
 (30220, 0.18589726688663294),
 (1525, 0.18114921720575441),
 (1152, 0.17778282391891317),
 (39257, 0.17466879726981482),
 (38919, 0.17353996877713557),
 (46427, 0.17150704005692913),
 (2112, 0.169968054243188),
 (66442, 0.1668997471942932),
 (7835, 0.15744785349915),
 (54974, 0.15167228355144546),
 (9990, 0.14875541996487895),
 (1736, 0.4798247877202296),
 (9991, 0.46032680848804475),
 (70202, 0.3547780812098047),
 (7333, 0.35350804503472966),
 (39257, 0.34577644460072177),
 (604, 0.34385001634846385),
 (56607, 0.3418567776397122),
 (72832, 0.3388487540654707),
 (56523, 0.33155854804043894),
 (15059, 0.3283176150551441),
 (3515, 0.32398464423667206),
 (56939, 0.3189667372267104),
 (10062, 0.31867161523365817),
 (31094,