In [1]:
import re
import json
import fuzzy
import pickle
import jellyfish
import itertools
import pandas as pd
import textdistance
import editdistance
from tqdm import tqdm
from fuzzywuzzy import fuzz
from collections import Counter
from cdifflib import CSequenceMatcher

In [2]:
#"index_files4/shodhganga_mentorship_dataset_edit_dist_v2.csv"

In [3]:
#folder="dataset_v5/v5_2/"

In [4]:
# Directory prefix (relative path) that is prepended to the input CSV name
# when loading and to the output CSV name when saving.
folder = '../dataset_v5/v5_2/v5_2_2/'

In [5]:
# The similarity measures used below fall into three families: phonetic (sound-based), token-based, and character-based algorithms.

In [6]:
# Read the comma-separated dataset from `folder` into the `ment` DataFrame.
ment = pd.read_csv(f"{folder}final_shodhganga_dataset_v5_2_7_1.csv", sep=",")

In [7]:
# Show the column labels of the loaded DataFrame.
ment.columns

Index(['advisorId', 'researcherId', 'advId', 'resId', 'advisor_name',
       'researcher_name', 'publisher_dept', 'DepartmentId',
       'publisher_institution', 'instituteId', 'title', 'thesisId',
       'N_thesisId', 'date_submitted', 'new_date_awarded', 'dc.date.awarded',
       'dc.subject.ddc', 'uniq_ddc_across_dept', 'dc.description.abstract',
       'dc.title.alternative', 'N_DepartmentId_1', 'ddc_code', 'advisor_advId',
       'researcher_resId'],
      dtype='object')

In [8]:
# Regex fragments for the punctuation characters that are stripped from names.
# Raw strings are used for the escaped entries so the backslash reaches the
# regex engine without Python reporting an "invalid escape sequence".
chars = [',', ';', r'\.', ':', '-', r'\)', r'\(', '_']

In [9]:
def lcs(name1, name2):
    """Remove the longest common substring from both names.

    The longest contiguous block shared by ``name1`` and ``name2`` is located
    with ``CSequenceMatcher.find_longest_match`` and every occurrence of that
    substring is deleted from each name.

    Returns:
        tuple: ``(name1_remainder, name2_remainder)``. When nothing is shared
        the inputs come back unchanged.
    """
    matcher = CSequenceMatcher(None, name1, name2)
    block = matcher.find_longest_match(0, len(name1), 0, len(name2))
    shared = name1[block.a:block.a + block.size]
    # Literal replacement of all occurrences; an empty `shared` is a no-op.
    return name1.replace(shared, ""), name2.replace(shared, "")

In [10]:
def rm_splChar(name):
    """Strip the punctuation listed in ``chars`` from a name.

    The input is converted with ``str()`` first, so non-string values are
    accepted.

    Returns:
        tuple: ``(compact, spaced)`` where ``compact`` has both spaces and the
        listed punctuation removed, and ``spaced`` has only the punctuation
        removed (spaces are kept).
    """
    text = str(name)
    punctuation = re.compile("|".join(chars))
    compact = punctuation.sub("", text.replace(" ", ""))
    spaced = punctuation.sub("", text)
    return compact, spaced

In [11]:
def char_dist(name1, name2):
    """Return 1 if both names contain exactly the same characters, else 0.

    Each name is cleaned with ``rm_splChar`` and the space-free variant (the
    first element of the returned tuple) is used, so the comparison ignores
    spaces, the listed punctuation and character order.

    ``rm_splChar`` returns a 2-tuple; feeding that tuple straight into
    ``Counter`` would count the two whole strings instead of their characters,
    which is why the first element is unpacked explicitly here.

    Returns:
        int: 1 when the character multisets are equal, otherwise 0.
    """
    compact1, _ = rm_splChar(name1)
    compact2, _ = rm_splChar(name2)
    return int(Counter(compact1) == Counter(compact2))

In [12]:
def diff_lib(name1, name2):
    """Return the ``CSequenceMatcher`` ratio of the two lower-cased names.

    A single space character is passed to the matcher as junk.
    """
    def is_space(ch):
        return ch == ' '

    return CSequenceMatcher(is_space, name1.lower(), name2.lower()).ratio()

In [13]:
def jaro_winkler_score(name1, name2):
    """Return textdistance's normalized Jaro-Winkler similarity of the names."""
    return textdistance.jaro_winkler.normalized_similarity(name1, name2)

In [14]:
def levenshtein_score(name1, name2):
    """Return textdistance's normalized Levenshtein similarity of the names."""
    return textdistance.levenshtein.normalized_similarity(name1, name2)

In [15]:
def fuzzy_nysiis(name1, name2):
    """Return a distance between the NYSIIS codes of the two names.

    The edit distance between the two codes is divided by the length of the
    longer code. When both codes are empty the result is 0.
    """
    code1, code2 = fuzzy.nysiis(name1), fuzzy.nysiis(name2)
    if not (code1 or code2):
        # Nothing to compare and no length to divide by.
        return 0
    longest = max(len(code1), len(code2))
    return editdistance.eval(code1, code2) / longest

In [16]:
def fuzzy_DMetaphone(name1, name2):
    """Return a distance between the metaphone codes of the two names.

    Despite the function name, the codes come from ``jellyfish.metaphone``.
    The edit distance between the two codes is divided by the length of the
    longer code. When both codes are empty the result is 0.
    """
    code1, code2 = jellyfish.metaphone(name1), jellyfish.metaphone(name2)
    if not (code1 or code2):
        # Nothing to compare and no length to divide by.
        return 0
    longest = max(len(code1), len(code2))
    return editdistance.eval(code1, code2) / longest

In [17]:
#Soundex is a phonetic algorithm 
def jellyfish_soundex(name1, name2):
    """Return a distance between the Soundex codes of the two names.

    The edit distance between the two codes is divided by the length of the
    longer code.

    Returns:
        The length-scaled edit distance, or 0 when both codes are empty
        (same convention as ``fuzzy_nysiis`` and ``fuzzy_DMetaphone``).
    """
    s1 = jellyfish.soundex(name1)
    s2 = jellyfish.soundex(name2)
    if not (s1 or s2):
        # Both codes empty: max(len) would be 0 and the division would raise
        # ZeroDivisionError, so report zero distance instead.
        return 0
    return editdistance.eval(s1, s2) / max(len(s1), len(s2))

In [18]:
def fuzzy_wuzzy(name1, name2):
    """Return fuzzywuzzy's ``token_set_ratio`` for the names, divided by 100."""
    return fuzz.token_set_ratio(name1, name2) / 100

In [19]:
def hamming_similarity(name1, name2):
    """Return textdistance's normalized Hamming similarity of the names."""
    return textdistance.hamming.normalized_similarity(name1, name2)

In [20]:
def jaccard_similarity(name1, name2):
    """Return textdistance's normalized Jaccard similarity of the names."""
    return textdistance.jaccard.normalized_similarity(name1, name2)

In [21]:
def cosine_similarity(name1, name2):
    """Return textdistance's normalized cosine similarity of the names.

    Uses ``textdistance.cosine``; calling the Jaccard algorithm here would
    simply duplicate ``jaccard_similarity`` in the feature vector.
    """
    return textdistance.cosine.normalized_similarity(name1, name2)

In [22]:
def damerau_levenshtein_similarity(name1, name2):
    """Return textdistance's normalized Damerau-Levenshtein similarity."""
    return textdistance.damerau_levenshtein.normalized_similarity(name1, name2)

In [23]:
def sorensen_dice_similarity(name1, name2):
    """Return textdistance's normalized Sorensen-Dice similarity of the names."""
    return textdistance.sorensen_dice.normalized_similarity(name1, name2)

In [24]:
def calculate_feats(name1, name2):
    """Build the list of 12 similarity scores for a pair of names.

    The three phonetic helpers return distances, so each is converted to a
    similarity with ``1 - distance``. ``char_dist`` is not part of the vector.

    Returns:
        list: the scores, in a fixed order.
    """
    return [
        diff_lib(name1, name2),
        jaro_winkler_score(name1, name2),
        levenshtein_score(name1, name2),
        1 - fuzzy_nysiis(name1, name2),       # distance -> similarity
        1 - fuzzy_DMetaphone(name1, name2),   # distance -> similarity
        1 - jellyfish_soundex(name1, name2),  # distance -> similarity
        fuzzy_wuzzy(name1, name2),
        hamming_similarity(name1, name2),
        jaccard_similarity(name1, name2),
        cosine_similarity(name1, name2),
        damerau_levenshtein_similarity(name1, name2),
        sorensen_dice_similarity(name1, name2),
    ]

In [25]:
def save_obj(obj, name):
    """Pickle ``obj`` to the file ``name + '.pkl'`` using the highest protocol."""
    target = name + '.pkl'
    with open(target, 'wb') as handle:
        pickle.Pickler(handle, pickle.HIGHEST_PROTOCOL).dump(obj)

def load_obj(name):
    """Load a pickled object from disk.

    Args:
        name: path of the pickle file. If no file exists at ``name`` but one
            exists at ``name + '.pkl'``, the latter is opened, so the same
            base name given to ``save_obj`` can be passed here.

    Returns:
        The unpickled object.

    Raises:
        FileNotFoundError: if neither candidate path exists.
    """
    import os  # local import keeps this cell self-contained

    path = name
    if not os.path.isfile(path) and os.path.isfile(name + '.pkl'):
        path = name + '.pkl'
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(path, 'rb') as f:
        return pickle.load(f)

In [26]:
def name_sim_score(name1, name2):
    """Score how similar two names are as a list of summed feature vectors.

    Both names are cleaned with ``rm_splChar`` (giving a space-free and a
    space-preserving variant) and the longest common substring is removed
    from each variant pair with ``lcs``. For each variant, the leftovers are
    compared when both are non-empty; otherwise the cleaned names themselves
    are compared. Every comparison is done twice: as-is and lower-cased.

    Returns:
        list: four sums of ``calculate_feats`` in the order
        [space-free, space-free lower-cased, spaced, spaced lower-cased],
        followed by an extra ``12`` when, for either variant, both leftovers
        are blank after stripping whitespace.
    """
    compact1, spaced1 = rm_splChar(name1)
    compact2, spaced2 = rm_splChar(name2)
    rest_c1, rest_c2 = lcs(compact1, compact2)
    rest_s1, rest_s2 = lcs(spaced1, spaced2)

    variants = (
        ((compact1, compact2), (rest_c1, rest_c2)),
        ((spaced1, spaced2), (rest_s1, rest_s2)),
    )
    scores = []
    for (full_a, full_b), (rest_a, rest_b) in variants:
        # Compare the leftovers only if something remains on both sides.
        if rest_a and rest_b:
            left, right = rest_a, rest_b
        else:
            left, right = full_a, full_b
        scores.append(sum(calculate_feats(left, right)))
        scores.append(sum(calculate_feats(left.lower(), right.lower())))

    compact_consumed = rest_c1.strip() == '' and rest_c2.strip() == ''
    spaced_consumed = rest_s1.strip() == '' and rest_s2.strip() == ''
    if compact_consumed or spaced_consumed:
        scores.append(12)
    return scores

In [27]:
# Sample pair for a quick manual check of name_sim_score; both values are
# the same string here.
name1='Jaya P' 
name2='Jaya P' #check for Shastri in other versions

In [28]:
#set(name1)-set(name2)

In [29]:
# Score the sample pair defined above (identical names).
name_sim_score(name1,name2)

[12.0, 12.0, 12.0, 12.0, 12]

In [30]:
#sum(calculate_feats("aramanand", "ramanand"))

In [31]:
# Show the column labels of `ment` again before the merge step.
ment.columns

Index(['advisorId', 'researcherId', 'advId', 'resId', 'advisor_name',
       'researcher_name', 'publisher_dept', 'DepartmentId',
       'publisher_institution', 'instituteId', 'title', 'thesisId',
       'N_thesisId', 'date_submitted', 'new_date_awarded', 'dc.date.awarded',
       'dc.subject.ddc', 'uniq_ddc_across_dept', 'dc.description.abstract',
       'dc.title.alternative', 'N_DepartmentId_1', 'ddc_code', 'advisor_advId',
       'researcher_resId'],
      dtype='object')

In [32]:
#name_sim_score(name1, name2)

In [33]:
#ment = ment[(ment["N_thesisId"]=='T42107') | (ment["advId"].isin([65824,65829]))].copy()

In [34]:
# Distinct values of the N_thesisId column, in order of first appearance.
uniq_thesisId = ment['N_thesisId'].unique()

In [35]:
# Merge researcher ids that belong to the same thesis and whose names are
# near-duplicates. `ment` is modified in place (columns resId and advId).

# name_sim_score returns sums of 12 per-metric scores; a pair of names is
# treated as the same person when the best sum exceeds this threshold.
SIM_THRESHOLD = 8

# Only theses with more than one distinct resId can need merging. Computing
# that once avoids filtering the whole frame for every thesis id. NaN resIds
# are counted as a value (dropna=False), matching what pd.unique would report.
res_per_thesis = ment.groupby('N_thesisId')['resId'].nunique(dropna=False)
multi_res_tids = set(res_per_thesis[res_per_thesis > 1].index)

for tid in tqdm(uniq_thesisId):
    if tid not in multi_res_tids:
        continue

    tmp = ment[ment['N_thesisId'] == tid]
    uniq_names = pd.unique(tmp['researcher_name'])

    # Collect every name that takes part in at least one similar pair.
    sim_lst = []
    for name1, name2 in itertools.combinations(uniq_names, 2):
        if max(name_sim_score(name1, name2)) > SIM_THRESHOLD:
            sim_lst.append(name1)
            sim_lst.append(name2)
    if not sim_lst:
        continue

    # Rows of this thesis whose researcher name is in the similar set. Neither
    # column used here is modified below, so the mask can be reused.
    rows = (ment['N_thesisId'] == tid) & (ment['researcher_name'].isin(sim_lst))
    studid = pd.unique(ment.loc[rows, 'resId'])

    # Researcher ids that also occur as an advisor id somewhere in `ment`.
    tmp_id = []
    for sid in studid:
        adv_match = ment.loc[ment['advId'] == sid, 'advId']
        if not adv_match.empty:
            tmp_id.append(adv_match.iloc[0])

    if len(tmp_id) == 1:
        # Exactly one id is also an advisor id: keep that one.
        ment.loc[rows, 'resId'] = tmp_id[0]
    elif len(tmp_id) == 0:
        # None is an advisor id: collapse onto the smallest researcher id.
        ment.loc[rows, 'resId'] = min(ment.loc[rows, 'resId'].tolist())
    else:
        # Several ids are advisor ids: collapse researcher and advisor ids
        # onto the smallest of them and flag the thesis for manual review.
        ment.loc[rows, 'resId'] = min(tmp_id)
        ment.loc[ment['advId'].isin(tmp_id), 'advId'] = min(tmp_id)
        print("Check for this thesis id : " + str(tid))

100%|██████████| 189086/189086 [33:59<00:00, 92.69it/s]


In [57]:
#manual correction 
#ment.loc[((ment['advId']==50955)&(ment["advisor_name"].isin(['Singh, Anjani Kumar','Singh, Anjanee Kumar']))),'advId']= max(ment['advId'].max(),ment['resId'].max())+1

In [61]:
# Write the updated frame to a new CSV in `folder`, without the index column.
ment.to_csv(f"{folder}final_shodhganga_dataset_v5_2_7_2.csv", sep=",", index=False)

In [35]:
#uniq_thesisId[0]

In [37]:
# Spot check: rows whose resId is 96037 or 96056.
ment[ment['resId'].isin([96037,96056])]

Unnamed: 0,advisorId,researcherId,advId,resId,advisor_name,researcher_name,publisher_dept,DepartmentId,publisher_institution,instituteId,...,new_date_awarded,dc.date.awarded,dc.subject.ddc,uniq_ddc_across_dept,dc.description.abstract,dc.title.alternative,N_DepartmentId_1,ddc_code,advisor_advId,researcher_resId
2458,1183,96037,1183,96037,"Muthamizhchelvan, C.","Archana, J.",department of physics,D1787,SRM University,I47,...,2011-07-01,,500::Natural sciences & mathematics|530::Physics,"('500', '530')",,,D1787,"('500', '530')","Muthamizhchelvan, C.@1183","Archana, J.@96037"
2467,1188,96056,1184,96056,Ponnusamy S.,Anbuchezhiyan M,department of physics,D1787,SRM University,I47,...,2010-07-09,,500::Natural sciences & mathematics|530::Physics,"('500', '530')",,,D1787,"('500', '530')",Ponnusamy S.@1184,Anbuchezhiyan M@96056


In [38]:
# Spot check: rows whose advId is 7361 or 7366.
ment[ment['advId'].isin([7361,7366])]

Unnamed: 0,advisorId,researcherId,advId,resId,advisor_name,researcher_name,publisher_dept,DepartmentId,publisher_institution,instituteId,...,new_date_awarded,dc.date.awarded,dc.subject.ddc,uniq_ddc_across_dept,dc.description.abstract,dc.title.alternative,N_DepartmentId_1,ddc_code,advisor_advId,researcher_resId
17156,7361,109235,7361,109235,"Anand Shastri, P.","Kumari, Asha",department of sanskrit,D453,Aligarh Muslim University,I292,...,1983-07-01,,400::Language|490::Other languages|491::East I...,"('400', '490', '491')",,,D1989,"('400', '490', '491')","Anand Shastri, P.@7361","Kumari, Asha@109235"
17168,7366,109248,7366,109248,"Sharma, S. R.","Ali, Shakir",department of sanskrit,D453,Aligarh Muslim University,I292,...,1982-07-01,,400::Language|490::Other languages|491::East I...,"('400', '490', '491')",,,D1989,"('400', '490', '491')","Sharma, S. R.@7366","Ali, Shakir@109248"


In [39]:
# Spot check: rows belonging to thesis id "T42107".
ment[ment['N_thesisId']=="T42107"]

Unnamed: 0,advisorId,researcherId,advId,resId,advisor_name,researcher_name,publisher_dept,DepartmentId,publisher_institution,instituteId,...,new_date_awarded,dc.date.awarded,dc.subject.ddc,uniq_ddc_across_dept,dc.description.abstract,dc.title.alternative,N_DepartmentId_1,ddc_code,advisor_advId,researcher_resId
17112,7339,7364,7339,7328,"Shastri, Paramanand","Sharma, Satyaprakash",department of sanskrit,D453,Aligarh Muslim University,I292,...,1978-07-01,,400::Language|490::Other languages|491::East I...,"('400', '490', '491')",,,D1989,"('400', '490', '491')","Shastri, Paramanand@7339","Sharma, Satyaprakash@7328"


In [33]:
#ment[ment['advId']==88889]