In [2]:
import re
import pandas as pd
import traceback
import pickle
import itertools
import spacy
import textdistance
from collections import Counter
from cdifflib import CSequenceMatcher
from tqdm import tqdm

In [3]:
chars=[',',";","\.",":","-","_"]

In [4]:
def rm_splChar(name):
    name = str(name)
    name1 = re.sub(" +","",name)
    regex = "|".join(chars)
    name1 = re.sub(regex,"", name1)
    name2 = re.sub(regex,"", name)
    #val = re.sub('[^A-Za-z]+', '', val)
    return name1, name2

In [5]:
def char_dist(name1, name2):
    name1=rm_splChar(name1)
    name2=rm_splChar(name2)
    return int(Counter(name1)==Counter(name2))

In [6]:
def diff_lib(name1, name2):
    name1=name1.lower()
    name2=name2.lower()
    ratio=CSequenceMatcher(lambda x: x == ' ', name1, name2).ratio()
    return ratio

In [7]:
def jaro_winkler_score(name1, name2):
    jw_score=textdistance.jaro_winkler.normalized_similarity(name1,name2)
    return jw_score

In [8]:
def levenshtein_score(name1, name2):
    leven_score = textdistance.levenshtein.normalized_similarity(name1,name2)
    return leven_score

In [9]:
def hamming_similarity(name1, name2):
    h_score=textdistance.hamming.normalized_similarity(name1,name2)
    return h_score

In [10]:
def jaccard_similarity(name1, name2):
    j_score=textdistance.jaccard.normalized_similarity(name1,name2)
    return j_score

In [11]:
def damerau_levenshtein_similarity(name1, name2):
    dl_score=textdistance.damerau_levenshtein.normalized_similarity(name1,name2)
    return dl_score

In [12]:
def sorensen_dice_similarity(name1, name2):
    sd_score=textdistance.sorensen_dice.normalized_similarity(name1,name2)
    return sd_score

In [13]:
def cosine_similarity(name1, name2):
    c_score=textdistance.jaccard.normalized_similarity(name1,name2)
    return c_score

In [14]:
def calculate_feats(name1, name2):
    sim_score=[]
    #sim_score.append(char_dist(name1, name2))
    sim_score.append(diff_lib(name1,name2))
    #sim_score.append(jaro_winkler_score(name1, name2))
    sim_score.append(levenshtein_score(name1, name2))
    #sim_score.append(hamming_similarity(name1, name2))
    sim_score.append(jaccard_similarity(name1, name2))
    sim_score.append(cosine_similarity(name1, name2))
    sim_score.append(damerau_levenshtein_similarity(name1, name2))
    sim_score.append(sorensen_dice_similarity(name1, name2))
    return sim_score

In [15]:
def lcs(name1, name2):
    match = CSequenceMatcher(None, name1, name2).find_longest_match(0, len(name1), 0, len(name2))
    common_subs=name1[match.a: match.a + match.size]
    name1=re.sub(re.escape(common_subs),"",name1)
    name2=re.sub(re.escape(common_subs),"",name2)
    return name1,name2

In [16]:
def save_obj(obj, name ):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

In [17]:
def load_obj(name):
    with open(name, 'rb') as f:
        return pickle.load(f)

In [27]:
def thesis_similarity(thesis_df, thresold=5.90):
    unique_thesis = pd.unique(thesis_df['dc.title[]']).copy()
    thesis_dict={}
    count=0
    try:
        for thesis1, thesis2 in tqdm(itertools.combinations(unique_thesis, 2), total=(unique_thesis.shape[0]*(unique_thesis.shape[0]-1))/2):
            thesis_lst1 = set(thesis1.split())
            thesis_lst2 = set(thesis2.split()) 
            score = len(thesis_lst1.intersection(thesis_lst2))/max(len(thesis_lst1),len(thesis_lst2)) 
            print(thesis1+"___"+thesis2+"\n")
            print(score)
            if score > 0.50:
                n1, n10 = rm_splChar(thesis1)
                n2, n20 = rm_splChar(thesis2)
                n11, n21 = lcs(n1, n2)
                n101, n201 = lcs(n10,n20)
                if (len(n11) > 5 and len(n21)>5): 
                    vec1 = calculate_feats(n11, n21)
                    vec2 = calculate_feats(n11.lower(), n21.lower())
                else:
                    vec1 = calculate_feats(n1, n2)
                    vec2 = calculate_feats(n1.lower(), n2.lower())
                if (len(n101)>5 and len(n201)>5): 
                    vec3=calculate_feats(n101, n201)
                    vec4=calculate_feats(n101.lower(), n201.lower())
                else:
                    vec3=calculate_feats(n10, n20)
                    vec4=calculate_feats(n10.lower(), n20.lower())
                print(sum(vec1),sum(vec2),sum(vec3),sum(vec4))
                if (sum(vec1) > thresold) or (sum(vec2) > thresold) or (sum(vec3) > thresold) or (sum(vec4) > thresold) :
                    tid1 = thesis_df[thesis_df['dc.title[]']==thesis1].copy()  #['thesisId'])                
                    tid2 = thesis_df[thesis_df['dc.title[]']==thesis2].copy()  #['thesisId'])
                    dept1 = tid1['DepartmentId'].tolist()
                    dept2 = tid2['DepartmentId'].tolist()
                    inst1 = tid1['instituteId'].tolist()
                    inst2 = tid2['instituteId'].tolist()
                    comm_inst = set(inst1).intersection(inst2)
                    comm_dept = set(dept1).intersection(dept2)
                    if comm_inst and comm_dept :
                        tid11 = pd.unique(tid1['thesisId'])
                        tid21 = pd.unique(tid2['thesisId'])
                        thesis_dict[(tuple(tid11), tuple(tid21),sum(vec1),sum(vec2),sum(vec3), sum(vec4))]=(thesis1, thesis2)
                        count+=1
    except Exception as e:
        print(e)
        traceback.print_exc()
    finally:
        print('No.of similar thesis :'+str(count))
        save_obj(thesis_dict, "similar_thesis_"+str(count))
    return thesis_dict

In [25]:
# if __name__ == "__main__":
#     tqdm.pandas()
#     ment =  pd.read_csv("index_files4/final_ment_w_baseline_gen4.csv")
#     thesis_similarity(ment)
#     #print(a)

In [20]:
ment =  pd.read_csv("index_files4/final_ment_w_baseline_gen4.csv")

In [31]:
ment1=ment[ment['researcher_name']=="Singh, Suman"].copy()

In [32]:
a=thesis_similarity(ment1)

 13%|█▎        | 29/231.0 [00:00<00:00, 289.86it/s]

Gramin shakti sanrachna tatha gramin sttarikaran ke pariwartit pratiman___Gramin samudaye mein satta sanrachna ke parivartit pratiman Janpad Jaunpur ka samajshastriya adhyayan

0.3076923076923077
Gramin shakti sanrachna tatha gramin sttarikaran ke pariwartit pratiman___Jaunpur janpad mein bhumi sudhar karyakramo ki pragati Ek moolyankan

0.0
Gramin shakti sanrachna tatha gramin sttarikaran ke pariwartit pratiman___Jaunpur janpad mein bhumi sudhar karyakramon ki pragati Ek mulyankan

0.0
Gramin shakti sanrachna tatha gramin sttarikaran ke pariwartit pratiman___Bharat ke aarthik vikas mein pratyaksh videshi viniyog ki bhumika 1991 ke baad

0.08333333333333333
Gramin shakti sanrachna tatha gramin sttarikaran ke pariwartit pratiman___Land and human resource A geographical study of tehsil Ghazipur District Ghazipur

0.0
Gramin shakti sanrachna tatha gramin sttarikaran ke pariwartit pratiman___Karanda vikas khand Gazipur mein gramya vikas karyakram Ek bhaugolik adhyayan

0.0
Gramin shakti sa

 67%|██████▋   | 154/231.0 [00:01<00:00, 157.19it/s]

5.175983436853002 5.175983436853002 5.303703703703704 5.303703703703704
Jaunpur janpad mein bhumi sudhar karyakramo ki pragati Ek moolyankan___Bharat ke aarthik vikas mein pratyaksh videshi viniyog ki bhumika 1991 ke baad

0.16666666666666666
Jaunpur janpad mein bhumi sudhar karyakramo ki pragati Ek moolyankan___Land and human resource A geographical study of tehsil Ghazipur District Ghazipur

0.0
Jaunpur janpad mein bhumi sudhar karyakramo ki pragati Ek moolyankan___Karanda vikas khand Gazipur mein gramya vikas karyakram Ek bhaugolik adhyayan

0.2
Jaunpur janpad mein bhumi sudhar karyakramo ki pragati Ek moolyankan___Bhartiye kavyashastra Aur Kabir ka sahitya

0.0
Jaunpur janpad mein bhumi sudhar karyakramo ki pragati Ek moolyankan___Nagarjuna ke katha sahitya mein janwadi chetna

0.1
Jaunpur janpad mein bhumi sudhar karyakramo ki pragati Ek moolyankan___Tritya saptak mein samajik evam rashtriye chetna

0.1
Jaunpur janpad mein bhumi sudhar karyakramo ki pragati Ek moolyankan___Tritiya

100%|██████████| 231/231.0 [00:01<00:00, 177.97it/s]



0.0
Vyangyakaar Harishankar Parsai Aur unka rachna karm___Growth response of certain rabi crop plants to Nickel pollution

0.0
Vyangyakaar Harishankar Parsai Aur unka rachna karm___studies on active principles of lipids from some aquatic and terrestrial sources

0.0
Lokgeet parampara Aur prayog___Bulk rheological behaviour of Blood in narrow vessels

0.0
Lokgeet parampara Aur prayog___B Ed Prashiksharthiyon ke pratyakshikrit pratibal star swa samman tatha unke shikshan dakshata ka adhyayan Varanasi Jaunpur evam Ballia Janpad ke sandarbh mein

0.0
Lokgeet parampara Aur prayog___Synthesis characterization and application of gold nanoparticles in biosensors

0.0
Lokgeet parampara Aur prayog___B ed prashikshyanaarthion ke pratyakshikrit pratibal sttar swa samman tatha unke sikshan dakshata ka addhyayan

0.0
Lokgeet parampara Aur prayog___Morpho taxonomic studies on seedlings of medicinal and agri horticultural plants of Varanasi region

0.0
Lokgeet parampara Aur prayog___Growth response o