-
Notifications
You must be signed in to change notification settings - Fork 0
/
thesis_sim1.py
74 lines (61 loc) · 2.59 KB
/
thesis_sim1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#!/usr/bin/env python
# coding: utf-8
import argparse
import itertools
import os
import pickle
import re
import traceback

import pandas as pd
import spacy
from tqdm import tqdm
# Load the 'en_core_web_lg' spaCy pipeline once at import time;
# thesis_similarity() uses this module-level object to score title pairs.
nlp=spacy.load('en_core_web_lg')
# Enable tqdm's pandas integration.
# NOTE(review): nothing in the visible code appears to rely on it (tqdm is
# only used directly as a progress wrapper) - confirm before removing.
tqdm.pandas()
def thesis_pre_processing(name):
    """Return *name* as a string with every space character removed.

    The value is first converted with ``str()``, so non-string inputs
    (numbers, ``None``, NaN) are accepted and processed as their textual
    representation. Only the space character (``" "``) is stripped; tabs,
    newlines and all other characters are left untouched.

    Args:
        name: Any object; typically a title string.

    Returns:
        str: ``str(name)`` without spaces.
    """
    # A literal replacement is equivalent to the former re.sub(" +", "", ...)
    # and avoids the regex engine for a fixed single-character pattern.
    return str(name).replace(" ", "")
def save_obj(obj, name):
    """Pickle *obj* to the file ``name + '.pkl'``.

    The parent directory of the target file is created when it does not
    exist yet, so a missing output folder cannot make the write fail after
    the caller has already done its work.

    Args:
        obj: Any picklable object.
        name: Target path as a string, WITHOUT the ``.pkl`` extension
            (the extension is appended here).

    Raises:
        OSError: If the directory or file cannot be created/written.
        pickle.PicklingError: If *obj* cannot be pickled.
    """
    path = name + '.pkl'
    parent = os.path.dirname(path)
    if parent:
        # exist_ok=True makes repeated saves into the same folder a no-op.
        os.makedirs(parent, exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
def load_obj(name):
    """Load and return a pickled object from *name*.

    *name* is tried first exactly as given. If that file does not exist and
    *name* does not already end in ``.pkl``, ``name + '.pkl'`` is tried as
    well, so the same base name that was passed to ``save_obj`` (which
    appends ``.pkl``) can be passed here.

    Args:
        name: Path of the pickle file, with or without the ``.pkl``
            extension.

    Returns:
        The unpickled object.

    Raises:
        FileNotFoundError: If neither candidate path exists.
    """
    try:
        f = open(name, 'rb')
    except FileNotFoundError:
        if str(name).endswith('.pkl'):
            raise
        f = open(f"{name}.pkl", 'rb')
    with f:
        # SECURITY: pickle.load can execute arbitrary code; only use this on
        # files produced by this project, never on untrusted input.
        return pickle.load(f)
def thesis_similarity(thesis_df, thresold=0.85, index=0, size=50000):
    """Find pairs of highly similar thesis titles and pickle the result.

    Every title of ``thesis_df['dc.title[]']`` (position ``i``) is paired
    with every title of the batch ``[index, index + size)`` (batch offset
    ``j``) for which ``i > index + j``. Each pair is first screened with a
    cheap token test: the number of distinct space-separated tokens the two
    titles share, divided by the token count of the longer title, must
    exceed 0.50. Only pairs passing that screen are scored with the
    module-level ``nlp`` pipeline (``Doc.similarity``). Pairs scoring above
    *thresold* are stored as::

        {(ids_of_title1, ids_of_title2, score): (title1, title2)}

    where each ``ids_*`` is the tuple of unique ``thesisId`` values of all
    rows carrying that title.

    The dictionary collected so far is always written with ``save_obj`` to
    ``./similar_thesis/similar_thesis_<index>_<index+size>`` - also when an
    exception interrupts the scan. Exceptions are printed together with
    their traceback and are not re-raised.

    Args:
        thesis_df: DataFrame with the columns ``'dc.title[]'`` and
            ``'thesisId'``.
        thresold: Minimum similarity score (exclusive) for a pair to be
            recorded. (The spelling is kept for caller compatibility.)
        index: Position of the first title of the batch.
        size: Number of titles in the batch.

    Returns:
        None. The result is only written to disk.
    """
    thesis_dict = {}
    count = 0
    try:
        titles = thesis_df['dc.title[]']

        # Slice and tokenise the batch once. Previously the slice was
        # re-evaluated and every title re-split for each outer title.
        batch = []
        for title in titles[index:index + size]:
            tokens = str(title).split(" ")
            batch.append((title, set(tokens), len(tokens)))
        batch_len = len(batch)

        # Exact number of pairs visited below, so the progress bar can reach
        # 100% (the former total, n_titles * size, overcounted).
        total_pairs = sum(
            min(batch_len, max(0, i - index)) for i in range(titles.shape[0])
        )

        with tqdm(total=total_pairs) as progress:
            for i, thesis1 in enumerate(titles):
                # i > index + j  <=>  j < i - index: only the first
                # (i - index) batch entries are partners of title i.
                n_partners = min(batch_len, max(0, i - index))
                if n_partners == 0:
                    continue

                tokens1 = str(thesis1).split(" ")
                token_set1 = set(tokens1)
                n_tokens1 = len(tokens1)
                # Parsed lazily and at most once per outer title instead of
                # once per candidate pair.
                doc1 = None

                for thesis2, token_set2, n_tokens2 in itertools.islice(batch, n_partners):
                    common_len = len(token_set1 & token_set2) / max(n_tokens1, n_tokens2)
                    if common_len > 0.50:
                        if doc1 is None:
                            # str() mirrors the token screen above, so a
                            # non-string title cannot abort the whole run.
                            doc1 = nlp(str(thesis1))
                        score = doc1.similarity(nlp(str(thesis2)))
                        if score > thresold:
                            tid1 = pd.unique(thesis_df[titles == thesis1]['thesisId'])
                            tid2 = pd.unique(thesis_df[titles == thesis2]['thesisId'])
                            thesis_dict[(tuple(tid1), tuple(tid2), score)] = (thesis1, thesis2)
                            count += 1

                progress.update(n_partners)
    except Exception as e:
        # Deliberate best effort: report the problem, then fall through to
        # ``finally`` so the pairs found so far are still saved.
        print(e)
        traceback.print_exc()
    finally:
        print('No.of similar thesis :'+str(count))
        save_obj(thesis_dict, "./similar_thesis/similar_thesis_"+str(index)+"_"+str(index+size))
if __name__ == "__main__":
    # Parse the command line before touching the data so that ``--help`` and
    # argument errors do not require the CSV to be present and loaded.
    parser = argparse.ArgumentParser(description='Thesis Similarity')
    parser.add_argument('--index', default=0, type=int, help="Enter the value to start from")
    # No static default: when --size is omitted the whole table is used as a
    # single batch (resolved below, once the row count is known).
    parser.add_argument('--size', default=None, type=int, help="Batch size")
    args = parser.parse_args()

    ment = pd.read_csv("index_files4/final_mod_ment_w_baseline_gen4.csv")
    # Assign the filled column back. Calling fillna(inplace=True) on the
    # result of ment[...] is chained assignment and is not guaranteed to
    # modify ``ment`` (it has no effect under pandas Copy-on-Write).
    ment['dc.title[]'] = ment['dc.title[]'].fillna("Not_Appl")

    size = ment.shape[0] if args.size is None else args.size
    thesis_similarity(ment, 0.85, args.index, size)