In [1]:
import re
import textdistance
import jellyfish
import itertools
import editdistance
import fuzzy
import pandas as pd
from collections import Counter
from cdifflib import CSequenceMatcher
from fuzzywuzzy import fuzz
from tqdm import tqdm
import pickle

In [2]:
import json

In [3]:
#Phonetic algorithm (sound based algorithm), Token based algorithm, character based algorithm

In [12]:
dataset=pd.read_csv("../outputFile_updated.csv",sep=",")

In [3]:
chars=[',',";","\."]

In [4]:
def rm_splChar(name):
    name = str(name)
    name1 = re.sub(" +","",name)
    regex = "|".join(chars)
    name1 = re.sub(regex,"", name1)
    name2 = re.sub(regex,"", name)
    #val = re.sub('[^A-Za-z]+', '', val)
    return name1, name2

In [5]:
def char_dist(name1, name2):
    name1=rm_splChar(name1)
    name2=rm_splChar(name2)
    return int(Counter(name1)==Counter(name2))

In [6]:
def diff_lib(name1, name2):
    name1=name1.lower()
    name2=name2.lower()
    ratio=CSequenceMatcher(lambda x: x == ' ', name1, name2).ratio()
    return ratio

In [7]:
def jaro_winkler_score(name1, name2):
    jw_score=textdistance.jaro_winkler.normalized_similarity(name1,name2)
    return jw_score

In [8]:
def levenshtein_score(name1, name2):
    leven_score = textdistance.levenshtein.normalized_similarity(name1,name2)
    return leven_score

In [9]:
def fuzzy_nysiis(name1, name2):
    ny1=fuzzy.nysiis(name1)
    ny2=fuzzy.nysiis(name2)
    if (ny1 or ny2):
        nysiis_score = editdistance.eval(ny1, ny2)/max(len(ny1),len(ny2))
    else:
        nysiis_score = 0
    return nysiis_score

In [10]:
def fuzzy_DMetaphone(name1, name2):
    d1=jellyfish.metaphone(name1)
    d2=jellyfish.metaphone(name2)
    if (d1 or d2):
        meta_score = editdistance.eval(d1, d2)/max(len(d1),len(d2))
    else:
        meta_score = 0
    return meta_score

In [11]:
#Soundex is a phonetic algorithm 
def jellyfish_soundex(name1, name2):
    s1=jellyfish.soundex(name1)
    s2=jellyfish.soundex(name2)
    sound_score = editdistance.eval(s1,s2)/max(len(s1),len(s2))
    return sound_score

In [12]:
def fuzzy_wuzzy(name1, name2):
    fuzz_score=fuzz.token_set_ratio(name1, name2)/100
    return fuzz_score

In [13]:
def hamming_similarity(name1, name2):
    h_score=textdistance.hamming.normalized_similarity(name1,name2)
    return h_score

In [14]:
def jaccard_similarity(name1, name2):
    j_score=textdistance.jaccard.normalized_similarity(name1,name2)
    return j_score

In [15]:
def cosine_similarity(name1, name2): #repeated jaccard similarity
    c_score=textdistance.jaccard.normalized_similarity(name1,name2)
    return c_score

In [16]:
def damerau_levenshtein_similarity(name1, name2):
    dl_score=textdistance.damerau_levenshtein.normalized_similarity(name1,name2)
    return dl_score

In [17]:
def sorensen_dice_similarity(name1, name2):
    sd_score=textdistance.sorensen_dice.normalized_similarity(name1,name2)
    return sd_score

In [20]:
# name1="Kumar, N. Ajith" 
# name2= "Ajith Kumar, N."

In [58]:
def calculate_feats(name1, name2):
    sim_score=[]
    #sim_score.append(char_dist(name1, name2))
    sim_score.append(diff_lib(name1,name2))
    sim_score.append(jaro_winkler_score(name1, name2))
    sim_score.append(levenshtein_score(name1, name2))
    #sim_score.append(1-fuzzy_nysiis(name1, name2))#distance
    #sim_score.append(1-fuzzy_DMetaphone(name1, name2))#distance
    #sim_score.append(1-jellyfish_soundex(name1,name2))#distance
    #sim_score.append(fuzzy_wuzzy(name1, name2))
    sim_score.append(hamming_similarity(name1, name2))
    sim_score.append(jaccard_similarity(name1, name2))
    sim_score.append(cosine_similarity(name1, name2))
    sim_score.append(damerau_levenshtein_similarity(name1, name2))
    sim_score.append(sorensen_dice_similarity(name1, name2))
    return sim_score

In [30]:
res_names=pd.unique(dataset[['dc.contributor.advisor[]', 'dc.creator.researcher[]']].values.ravel('K'))

In [31]:
# Intial indexing names considering exact match
name_index=pd.DataFrame({'r_names':res_names})
name_index['rid']=name_index.index
dataset["advisorId"]=dataset['dc.contributor.advisor[]'].map(name_index.set_index('r_names')['rid'])
dataset["researcherId"]=dataset['dc.creator.researcher[]'].map(name_index.set_index('r_names')['rid'])

In [32]:
#name_index.to_csv('researcher_index.csv',sep=",",index=False)

In [33]:
#dataset.to_csv('sodhganaga_dataset.csv',sep=",",index=False)

In [34]:
dataset.columns

Index(['advisorId', 'researcherId', 'dc.contributor.advisor[]',
       'dc.creator.researcher[]', 'dc.date.submitted[]', 'dc.language.iso[]',
       'dc.publisher.department[]', 'DepartmentId',
       'dc.publisher.institution[]', 'instituteId', 'dc.title[]', 'thesisId',
       'dc.type.degree', 'dc.date.awarded', 'dc.subject.ddc', 'dc.source.uri',
       'dc.description.abstract', 'dc.title.alternative'],
      dtype='object')

In [26]:
#problem 1 : different names reffering to same person--Name Linking (Problem 2 : Name Resolution Problem)

In [39]:
def lcs(name1, name2):
    match = CSequenceMatcher(None, name1, name2).find_longest_match(0, len(name1), 0, len(name2))
    common_subs=name1[match.a: match.a + match.size]
    name1=re.sub(re.escape(common_subs),"",name1)
    name2=re.sub(re.escape(common_subs),"",name2)
    return name1, name2

In [59]:
name1="Bhartiya grameen samajik sansthaon mein parivartan ki pravritti Ghazipur Janpad ke sadar kshetra panchayat par aadharit ek samaj vaigyanik adhyayan"
name2="Bhartiya grameen saamajik sansthaon men parivartan kee pravitti Ghazipur janpad ke sadr kshetra panchyat par aadharit ek samaj vaigyanik addhyayan"

In [65]:
name1="Bhartiya grameen samajik sansthaon mein parivartan ki pravritti Ghazipur Janpad ke sadar kshetra panchayat par aadharit ek samaj vaigyanik adhyayan"
name2="Bhartiya grameen saamajik sansthaon men parivartan kee pravitti Ghazipur janpad ke sadr kshetra panchyat par aadharit ek samaj vaigyanik addhyayan"

In [96]:
name1='A study of new generalized closed sets in generalized topology (2014-01-01)'
name2='A study of new generalized closed sets in generalized topology (2013-01-01)'#6.84

In [89]:
name1='Theodore Dreiser and the city a study of ambivalent response'   #6.66
name2='Theodore dreiser and the city a study ambivalent response'

In [102]:
name1='Special features of Indian tribal art and its influence on the Contemporary trends of art _ volume II'
name2= 'Special features of Indian tribal art and its influence on the Contemporary trends of art _ volume I' #0.00

In [111]:
name1='Special features of Indian'
name2='Special features of Indian'

In [118]:
name1='Amarkanth Ka Katha Sahithya Ek Vishleshnatmak Adhyayan',
name2='Ravindrakalia Ka Katha Saidtya Ek Vishleshnatmak Adhyayan'

In [28]:
name1="Johri, Alok"
name2="Alok, Johri."

In [119]:
n1, n10 = rm_splChar(name1)
n2, n20 = rm_splChar(name2)

In [120]:
n11, n21 = lcs(n1, n2)

In [121]:
n21

'RavindrakaliaKaKathaSaidt'

In [122]:
vec1 = calculate_feats(n11, n21)
#vec1 = calculate_feats(n1, n2)

In [123]:
sum(vec1)

3.7743950657185956

In [40]:
def find_similar_names(org_df, names_df, index=0, size=50000, start_frm=0) :
    tmp_list=[]
    similar_names = {}
    keep_cnt = 0
    count = 0
    try:
        list_of_pairs=((x,y) for i, x in enumerate(names_df['r_names']) for j, y in enumerate(names_df['r_names'][index:index+size]) if i > j )
        for name1, name2 in tqdm(list_of_pairs, total = names_df['r_names'].shape[0]*size):
        #for name1, name2 in tqdm(itertools.combinations(names_df['r_names'], 2), total=(names_df.shape[0]*(names_df.shape[0]-1))/2):
            keep_cnt += 1
            if keep_cnt > start_frm:
                n1, n10 = rm_splChar(name1)
                n2, n20 = rm_splChar(name2)
                check=len(set(n1).intersection(n2))/max(len(n1),len(n2))
                if check > 0.70:
                    n11, n21 = lcs(n1, n2)
                    n101, n201 = lcs(n10,n20)
                    if (n11 or n21): 
                        vec1 = calculate_feats(n11, n21)
                        vec2 = calculate_feats(n11.lower(), n21.lower())
                        tmp_list.append(vec1)
                    else:
                        vec1 = calculate_feats(n1, n2)
                        vec2 = calculate_feats(n1.lower(), n2.lower())
                        tmp_list.append(vec1)
                    if (n101 or n201): 
                        vec3=calculate_feats(n101, n201)
                        vec4=calculate_feats(n101.lower(), n201.lower())
                    else:
                        vec3=calculate_feats(n10, n20)
                        vec4=calculate_feats(n10.lower(), n20.lower())
                    if (sum(vec1) > 10) or (sum(vec2) > 10) or (sum(vec3) > 10) or (sum(vec4) > 10) :
                        inst1 = org_df[(org_df['dc.contributor.advisor[]'] == name1) | (org_df['dc.creator.researcher[]'] == name1)]["dc.publisher.institution[]"]
                        inst2 = org_df[(org_df['dc.contributor.advisor[]'] == name2) | (org_df['dc.creator.researcher[]'] == name2)]["dc.publisher.institution[]"]
                        dept1 = dataset[(dataset['dc.contributor.advisor[]'] == name1) | (dataset['dc.creator.researcher[]'] == name1)]["dc.publisher.department[]"]
                        dept2 = dataset[(dataset['dc.contributor.advisor[]'] == name2) | (dataset['dc.creator.researcher[]'] == name2)]["dc.publisher.department[]"]
                        common_inst=set(inst1).intersection(inst2)
                        common_dept=set(dept1).intersection(dept2)
                        if common_inst and common_dept:
                            similar_names[keep_cnt]=(name1,name2)
                        count += 1 
            else:
                continue
    except KeyboardInterrupt:
        save_obj(similar_names,str(keep_cnt))
        print("Pairs having similarity value greater than 10: "+str(count))
        return similar_names, tmp_list
    except ZeroDivisionError:
        print('divided by zero error')
        save_obj(similar_names,str(keep_cnt))
        print("Pairs having similarity value greater than 10: "+str(count))
        return similar_names, tmp_list
    print('Done')
    return similar_names, tmp_list

In [2]:
def save_obj(obj, name ):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

In [3]:
def load_obj(name):
    with open(name, 'rb') as f:
        return pickle.load(f)

In [43]:
#n1,n2=rm_splChar('Kumar, Rajesh')

In [44]:
# sum(calculate_feats(n11, n12))

In [45]:
#lcs('Singh, Rajinder', 'Singh, Harjinder')

In [46]:
# list_of_pairs=((x,y,i,j) for i,x in enumerate(name_index['r_names'][0:9]) for j,y in enumerate(name_index['r_names'][0:4]) if i>j )
# for name1,name2 in tqdm(list_of_pairs, total = 40):
#     print(name1+str(i)+"\t"+name2+str(j))

In [47]:
#name_index.iloc[0:9]

In [36]:
sim_names=find_similar_names(dataset, name_index, 0)

  3%|▎         | 885850526/28427174461.0 [11:45:43<365:41:24, 20920.35it/s]

Pairs having similarity value greater than 10: 2118





In [48]:
sim_names,tmp_list=find_similar_names(dataset, name_index, 0, 100000, 0)

  0%|          | 5221357/23844200000 [06:29<493:35:01, 13416.04it/s]


Pairs having similarity value greater than 10: 307


In [None]:
save_obj(sim_names,'check1')

In [49]:
len(tmp_list)

2980

In [50]:
df = pd.DataFrame(tmp_list)

In [51]:
df.corr()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,1.0,0.785661,0.819476,0.836272,0.813429,0.450987,0.996675,0.72233,0.84169,0.84169,0.823181,0.846324
1,0.785661,1.0,0.908196,0.756821,0.715844,0.316803,0.794582,0.824986,0.879397,0.879397,0.912017,0.888108
2,0.819476,0.908196,1.0,0.79374,0.783497,0.468151,0.818728,0.906428,0.867268,0.867268,0.992312,0.838403
3,0.836272,0.756821,0.79374,1.0,0.886218,0.507636,0.836343,0.737527,0.734273,0.734273,0.796372,0.708075
4,0.813429,0.715844,0.783497,0.886218,1.0,0.576974,0.812468,0.73924,0.692281,0.692281,0.786665,0.658426
5,0.450987,0.316803,0.468151,0.507636,0.576974,1.0,0.438588,0.555751,0.300507,0.300507,0.46567,0.236879
6,0.996675,0.794582,0.818728,0.836343,0.812468,0.438588,1.0,0.720666,0.850887,0.850887,0.822696,0.857321
7,0.72233,0.824986,0.906428,0.737527,0.73924,0.555751,0.720666,1.0,0.760553,0.760553,0.90128,0.720557
8,0.84169,0.879397,0.867268,0.734273,0.692281,0.300507,0.850887,0.760553,1.0,1.0,0.888005,0.981887
9,0.84169,0.879397,0.867268,0.734273,0.692281,0.300507,0.850887,0.760553,1.0,1.0,0.888005,0.981887


In [4]:
s_names=load_obj('120000_118442.pkl')

In [7]:
len(s_names)

780

In [6]:
s_names

{24628218326: ('D.Prakash', 'Prakash, D.'),
 24628220882: ('Muthukumar V', 'V.Muthukumar'),
 24628233588: ('GANESHKUMAR M', 'GANESHKUMAR, M.'),
 24628244802: ('Aruna.V', 'Aruna V'),
 24628591933: ('Ajantha J', 'Ajantha.J'),
 24628611716: ('Indra D', 'Indra.D'),
 24628816596: ('PAUL, D. JOHN', 'JOHN PAUL D'),
 24629159791: ('DEVI, S.', 'DEVI S'),
 24629163957: ('ALEXANDER, M.', 'ALEXANDER.M'),
 24629452100: ('PADMA PRIYA C', 'PRIYA, C. PADMA'),
 24629540695: ('Sampath N', 'Sampath, N'),
 24629810103: ('Adarsh C', 'Adarsh, C.'),
 24629819094: ('Rajeswari C', 'Rajeswari, C.'),
 24629880860: ('Rais, Khan', 'Khan, Rais'),
 24630908609: ('ARUN Y', 'ARUN.Y'),
 24631214731: ('Anjum,Sabista', 'Anjum, Sabista'),
 24631373925: ('Bhargav, Amit', 'Bharghav, Amit'),
 24631419453: ('Gupta, Sadhna', 'Gupta,Sadhna'),
 24631612520: ('Kaushik,Sachin', 'Kaushik, Sachin'),
 24631752443: ('Santhakumar, K', 'Santhakumar.K'),
 24632470304: ('Nayan, Talukdar', 'Talukdar, Nayan'),
 24632476122: ('Dipsikha, Kali

In [99]:
sim_names

{57: ('Raju, Kv', 'Raju, K. V.'),
 75: ('Ajith Kumar, N.', 'Kumar, N. Ajith'),
 115: ('Ajithkumar, N.', 'Kumar, N. Ajith'),
 382954: ('Narendran T. C.', 'Narendran, T. C.'),
 759412: ('Dominic, V J.', 'Dominic, V. J.')}

In [40]:
def gen_newIndex(org_dataset,names, similar_n) :  
    for key in similar_n:
        index=names[names['r_names'].isin(similar_n[key])]['rid'].values
        names.loc[names['r_names'].isin(similar_n[key]),'rid']=min(index)
    org_dataset["advId_1"]=org_dataset['dc.contributor.advisor[]'].map(names.set_index('r_names')['rid'])
    org_dataset["studId_1"]=org_dataset['dc.creator.researcher[]'].map(names.set_index('r_names')['rid'])
    print('Done')

In [55]:
len(sim_names)

725

In [41]:
gen_newIndex(dataset, name_index, sim_names)

Done


In [56]:
name_index[name_index['rid'].isin(name_index[name_index["rid"].duplicated()]['rid'].values)].shape

(1158, 2)

In [78]:
#dataset[dataset["dc.contributor.advisor[]"].str.contains('E. M.')]

In [93]:
name_index[name_index["rid"]==6]

Unnamed: 0,r_names,rid
6,"Dominic, V. J.",6
75946,"Dominic, V J.",6


In [None]:
#name_index.iloc[201000:,:].values

In [None]:
# ['Siddiraju, Rakesh Jat And Sridhar', 426], 
# ['Rakesh Jat And Siddiraju, Sridhar', 426],
# ['Chacko, Rakesh K. Jat And Annam', 427],
# ['Mohan, Rakesh Jat And S.', 428],
# ['Baheti, Rakesh Kumar Jat And Jagdish R.', 429],
# ['Jat, S. Mohan And Rakesh', 430],
# ['Rkjat, Sridhar Siddiraju And', 431],
# ['Kale, Rakesh Jat And Mohan K.', 432],
# ['Nirmal, Rakesh Jat And Sunil Ashokrao', 433],

In [None]:
# ['Gaurav Sharma, Nakuleshwar Dut Jasuja', 520],
# ['Jasuja, Rajesh Kumar Yadav / Nakuleshwar Dut', 521],
# ['Gaurav Sharma, Rajesh Kumar Yadav', 522],
# ['Vijay Singh Rathore , Ripu Rajan Sinha', 523],
# ['Dp Sharma , Bright Keshwani', 524],
# ['Asha Sharma, Gaurav Sharma, Pushpa Mehta', 525],
# ['Bbaskara Varma Thirupad K', 200035],

In [9]:
#name_index.iloc[10:30]

In [20]:
def Merge(dict1, dict2):
    res = {**dict1, **dict2}
    return res

In [10]:
lcs('Senthil Kumar J', 'Senthil Kumar S')

('J', 'S')

In [106]:
name_index.shape

(238442, 2)

In [10]:
dict1=load_obj("0_205235.pkl")#0_205235.pkl

In [5]:
#dict1

In [11]:
dict2=load_obj("237941500.pkl")

In [6]:
#dict1

In [14]:
dict3=load_obj("1000.pkl")

In [4]:
#dict3

In [7]:
second_batch_dict1 = load_obj("result/30106524461.pkl")

In [17]:
len(second_batch_dict1)

236

In [15]:
second_batch_dict2 = load_obj("result/18844150000.pkl")

In [16]:
len(second_batch_dict2)

2477

In [11]:
check_dict1 = load_obj("result1/1320725.pkl")

In [12]:
check_dict1

{42: ('Raju, Kv',
  11,
  'Raju, K. V.',
  1,
  5.0,
  12.0,
  4.166666666666666,
  6.833333333333333)}

In [19]:
second_batch_dict2

{4851: ('Ahmed, K. Basheer',
  'Ahamed, K. Basheer',
  8.122222222222224,
  8.122222222222224,
  8.122222222222224,
  8.122222222222224),
 5050: ('Ali, Muzaffer',
  'Ali, K. Muzaffer',
  9.157619047619049,
  9.157619047619049,
  8.116666666666665,
  8.116666666666665),
 39058: ('Godson Asirvatham L.',
  'Godwon Asirvatham L.',
  8.75,
  8.75,
  8.75,
  8.75),
 120294: ('Chandrasekar, K. S.',
  'Chandrasekhar, K. S.',
  9.05111111111111,
  9.05111111111111,
  9.48043956043956,
  9.48043956043956),
 212851: ('Charjan A. P.',
  'Chargan, A. P.',
  9.216666666666669,
  9.216666666666669,
  9.470000000000002,
  9.470000000000002),
 233584: ('Shallni, Nigma',
  'Shalini, Nigam',
  9.348095238095238,
  9.348095238095238,
  9.348095238095238,
  9.348095238095238),
 319585: ('Chirapanath, A. K.',
  'Chirappanath, A. K.',
  9.878181818181817,
  9.878181818181817,
  9.878181818181817,
  9.878181818181817),
 328454: ('Bhanwar, Ajs',
  'Bhanwer, Ajs',
  9.250000000000002,
  9.250000000000002,
  9.6

In [21]:
second_batch= Merge(second_batch_dict2, second_batch_dict1)

In [23]:
mod_sec_batch={}

In [24]:
for key in second_batch:
    mod_sec_batch[key]=(second_batch[key][0],second_batch[key][1])

In [25]:
mod_sec_batch

{4851: ('Ahmed, K. Basheer', 'Ahamed, K. Basheer'),
 5050: ('Ali, Muzaffer', 'Ali, K. Muzaffer'),
 39058: ('Godson Asirvatham L.', 'Godwon Asirvatham L.'),
 120294: ('Chandrasekar, K. S.', 'Chandrasekhar, K. S.'),
 212851: ('Charjan A. P.', 'Chargan, A. P.'),
 233584: ('Shallni, Nigma', 'Shalini, Nigam'),
 319585: ('Chirapanath, A. K.', 'Chirappanath, A. K.'),
 328454: ('Bhanwar, Ajs', 'Bhanwer, Ajs'),
 370230: ('Gambhir, Yogesh', 'Ghambir, Yogesh'),
 403631: ('Kaur, Sumanjit', 'Kaur, Damanjit'),
 417233: ('Dhindsa, Parammjeet Kaur', 'Dhindsa, Paramjeet Kaur'),
 444148: ('Mary Joseph T.', 'Joseph, Mary'),
 471905: ('Joshi, Namrata', 'Joshi, Namarta'),
 615494: ('Jahangirdar D. V.', 'Janhagirdar D. V.'),
 618809: ('Pachkawade A. H.', 'Pachkawde A. H.'),
 935001: ('Majumdar, M. N.', 'Majumder, M. N.'),
 963898: ('Ganguly, Nemi C.', 'Ganguli, Nemai C.'),
 1121252: ('Singh, Meenakshi', 'Singh, Menakshi'),
 1151403: ('Kunwar, Neelma', 'Kunvar, Neelam'),
 1152899: ('Sing, Meenakshi', 'Singh,

In [27]:
with open('Disambiguated_names_2.txt', 'w') as file:
     file.write(json.dumps(mod_sec_batch))