In [1]:
import re
import json
import fuzzy
import pickle
import jellyfish
import itertools
import pandas as pd
import textdistance
import editdistance
from tqdm import tqdm
from fuzzywuzzy import fuzz
from cdifflib import CSequenceMatcher
from collections import Counter

In [2]:
folder="dataset_v5/"

In [3]:
#Phonetic algorithm (sound based algorithm), Token based algorithm, character based algorithm

In [4]:
ment=pd.read_csv("index_files4/shodhganga_mentorship_dataset_edit_dist_v2.csv",sep=",")

In [5]:
ment.columns

Index(['advisorId', 'researcherId', 'advId', 'resId', 'advisor_name',
       'researcher_name', 'publisher_dept', 'DepartmentId',
       'publisher_institution', 'instituteId', 'dc.title[]', 'thesisId',
       'N_thesisId', 'dc.date.submitted[]', 'dc.date.awarded',
       'dc.subject.ddc', 'dc.description.abstract', 'dc.title.alternative'],
      dtype='object')

In [6]:
chars=[',',";","\.","-",":"]

In [7]:
def lcs(name1, name2):
    match = CSequenceMatcher(None, name1, name2).find_longest_match(0, len(name1), 0, len(name2))
    common_subs=name1[match.a: match.a + match.size]
    name1=re.sub(re.escape(common_subs),"",name1)
    name2=re.sub(re.escape(common_subs),"",name2)
    return name1,name2

In [8]:
def rm_splChar(name):
    name = str(name)
    name1 = re.sub(" +","",name)
    regex = "|".join(chars)
    name1 = re.sub(regex,"", name1)
    name2 = re.sub(regex,"", name)
    #val = re.sub('[^A-Za-z]+', '', val)
    return name1, name2

In [9]:
def char_dist(name1, name2):
    name1=rm_splChar(name1)
    name2=rm_splChar(name2)
    return int(Counter(name1)==Counter(name2))

In [10]:
def diff_lib(name1, name2):
    name1=name1.lower()
    name2=name2.lower()
    ratio=CSequenceMatcher(lambda x: x == ' ', name1, name2).ratio()
    return ratio

In [11]:
def jaro_winkler_score(name1, name2):
    jw_score=textdistance.jaro_winkler.normalized_similarity(name1,name2)
    return jw_score

In [12]:
def levenshtein_score(name1, name2):
    leven_score = textdistance.levenshtein.normalized_similarity(name1,name2)
    return leven_score

In [13]:
def fuzzy_nysiis(name1, name2):
    ny1=fuzzy.nysiis(name1)
    ny2=fuzzy.nysiis(name2)
    if (ny1 or ny2):
        nysiis_score = editdistance.eval(ny1, ny2)/max(len(ny1),len(ny2))
    else:
        nysiis_score = 0
    return nysiis_score

In [14]:
def fuzzy_DMetaphone(name1, name2):
    d1=jellyfish.metaphone(name1)
    d2=jellyfish.metaphone(name2)
    if (d1 or d2):
        meta_score = editdistance.eval(d1, d2)/max(len(d1),len(d2))
    else:
        meta_score = 0
    return meta_score

In [15]:
#Soundex is a phonetic algorithm 
def jellyfish_soundex(name1, name2):
    s1=jellyfish.soundex(name1)
    s2=jellyfish.soundex(name2)
    sound_score = editdistance.eval(s1,s2)/max(len(s1),len(s2))
    return sound_score

In [16]:
def fuzzy_wuzzy(name1, name2):
    fuzz_score=fuzz.token_set_ratio(name1, name2)/100
    return fuzz_score

In [17]:
def hamming_similarity(name1, name2):
    h_score=textdistance.hamming.normalized_similarity(name1,name2)
    return h_score

In [18]:
def jaccard_similarity(name1, name2):
    j_score=textdistance.jaccard.normalized_similarity(name1,name2)
    return j_score

In [19]:
def cosine_similarity(name1, name2):
    c_score=textdistance.jaccard.normalized_similarity(name1,name2)
    return c_score

In [20]:
def damerau_levenshtein_similarity(name1, name2):
    dl_score=textdistance.damerau_levenshtein.normalized_similarity(name1,name2)
    return dl_score

In [21]:
def sorensen_dice_similarity(name1, name2):
    sd_score=textdistance.sorensen_dice.normalized_similarity(name1,name2)
    return sd_score

In [22]:
def calculate_feats(name1, name2):
    sim_score=[]
    #sim_score.append(char_dist(name1, name2))
    sim_score.append(diff_lib(name1,name2))
    sim_score.append(jaro_winkler_score(name1, name2))
    sim_score.append(levenshtein_score(name1, name2))
    sim_score.append(1-fuzzy_nysiis(name1, name2))#distance
    sim_score.append(1-fuzzy_DMetaphone(name1, name2))#distance
    sim_score.append(1-jellyfish_soundex(name1,name2))#distance
    sim_score.append(fuzzy_wuzzy(name1, name2))
    sim_score.append(hamming_similarity(name1, name2))
    sim_score.append(jaccard_similarity(name1, name2))
    sim_score.append(cosine_similarity(name1, name2))
    sim_score.append(damerau_levenshtein_similarity(name1, name2))
    sim_score.append(sorensen_dice_similarity(name1, name2))
    return sim_score

In [23]:
def save_obj(obj, name ):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    with open(name, 'rb') as f:
        return pickle.load(f)

In [38]:
def name_sim_score(name1, name2,count=1):
    vec_sum=[]
    vec=0
    n1, n10 = rm_splChar(name1)
    #print(n1,n10)
    n2, n20 = rm_splChar(name2)
    #print(n2,n20)
    n11, n21 = lcs(n1, n2)
    #print(n11+"_@_"+n21)
    n101, n201 = lcs(n10,n20)
#     print(n101, n201)
#     print(n11, n21)
#     print(set(n1.lower()).intersection(n2.lower()))
#     check=len(set(n1.lower()).intersection(n2.lower()))/max(len(set(n1)),len(set(n2)))
#     print(check)
    if n11 or n21: 
        vec1 = calculate_feats(n11, n21)
        vec2 = calculate_feats(n11.lower(), n21.lower())
        vec_sum.append(sum(vec1))
        vec_sum.append(sum(vec2))
    else:
        vec1 = calculate_feats(n1, n2)
        vec2 = calculate_feats(n1.lower(), n2.lower())
        vec_sum.append(sum(vec1))
        vec_sum.append(sum(vec2))
    if n101 or n201: 
        vec3=calculate_feats(n101, n201)
        vec4=calculate_feats(n101.lower(), n201.lower())
        vec_sum.append(sum(vec3))
        vec_sum.append(sum(vec4))
    else:
        vec3=calculate_feats(n10, n20)
        vec4=calculate_feats(n10.lower(), n20.lower())
        vec_sum.append(sum(vec3))
        vec_sum.append(sum(vec4))
    if not any(a > 10 for a in vec_sum):
        name1_dist = Counter(name1.lower())
        name2_dist = Counter(name2.lower())
        extra_in_name1 = name1_dist-name2_dist
        extra_in_name2 = name2_dist-name1_dist
        dist_count = extra_in_name1 + extra_in_name2
        n13=n10.lower()
        n23=n20.lower()
        print("hi")
        print(dist_count)

        if (len(dist_count)==1) and ([True if (key.lower() in ['a','e','i','o','u','h'] and val==1) else False for key, val in dist_count.items()][0]) :
            vec5=calculate_feats(name1,name2)
            vec_sum.append(sum(vec5))
        elif (len(extra_in_name1)==1 and len(extra_in_name2)==0) or (len(extra_in_name1)==0 and len(extra_in_name2)==1):
            vec5=calculate_feats(name1,name2)
            vec_sum.append(sum(vec5))
        elif len(dist_count)==2 and any([v in [1,2] for k,v in dist_count.items()]):
            print('yes')
            if any([((pair[0] in dist_count.keys() and pair[1] in dist_count.keys()) or (pair[1] in dist_count.keys() and pair[0] in dist_count.keys())) for pair in char_pairs]):
                vec5=calculate_feats(name1,name2)
                vec_sum.append(sum(vec5)+1)
        elif any([(part in n11) for part in part_of_name]) and any([(part in n21) for part in part_of_name]):
            n14=re.sub("|".join(part_of_name),"",n13)
            n24=re.sub("|".join(part_of_name),"",n23)
            vec5=calculate_feats(n12,n22)
            vec_sum.append(sum(vec5))
        else:
            vec_sum.append(0)
    if (n11.strip()=="" and n21.strip()==""):
        return "same"
    return vec_sum #any(a > 10 for a in vec_sum)#

In [40]:
name_sim_score(name1, name2)

hi
Counter({' ': 2, '.': 2, ',': 1, 'd': 1, 'k': 1})


[0.0, 0.0, 0.0, 0.0, 0]

In [346]:
#['jat','singh','kumar','yadav','bandyopadhyay',"Bharathi"]

In [29]:
part_of_name=["k","kumar"]

In [30]:
# (name1.find(n11) > 2 and ' ' in name_part)
char_pairs=[("i","e"),("u","o"),("i","y"),("u","a")]

In [39]:
name1="Srivastava, D. K."
name2="Srivastava"

In [458]:
name1="Salimath, P. A."
name2="Salimath, P. V."

In [412]:
name1="'Narayana, N. V.'"
name2="Narayana, N."

In [364]:
sum(calculate_feats(name1,name2))

8.269551498760425

In [416]:
name1='Shastri, Paramanand'
name2='Shastri, Pramanand'

In [418]:
name1='Sharma, Sathyaprakash'
name2='Sharma, Satyaprakash'

In [456]:
name1='Dubey, Ajay Kumar'
name2='Dubey, Ajai Kumar'

In [422]:
name1='Bhadwal, Satish Chand'
name2='Badwal, Satish Chand'

In [424]:
name1='Pooja Singh'
name2='Puja Singh'

In [367]:
name1="Rakesh Kumar Jat"
name2="Jat, Rakesh K."
name3="Rakesh Jat"

In [387]:
name1_set=set(name1.lower().replace(",","").split())

In [388]:
name2_set=set(name2.lower().replace(",","").split())

In [389]:
comm_part=name1_set.intersection(name2_set)

In [390]:
name_1_remain=name1_set-comm_part

In [394]:
name1_diff=name1_set-comm_part

In [396]:
name2_diff=name2_set-comm_part

In [399]:
name2_diff

{'k.'}

In [356]:
set("Rakesh Kumar Jat".split()).intersection("Jat Rakesh K.".split())

{'Jat', 'Rakesh'}

In [358]:
sum(calculate_feats(name1,name2))

5.308031135531135

In [276]:
"Sathyaprakash".find('habbc')

-1

In [321]:
name1_dist=Counter(name1)
name2_dist=Counter(name2)
extra_in_name1=name1_dist-name2_dist
extra_in_name2=name2_dist-name1_dist
dist_count = extra_in_name1 + extra_in_name2

In [322]:
dist_count

Counter({'y': 1, 'i': 1})

In [318]:
extra_in_name2

Counter()

In [283]:
list(dist.values())[0]

1

In [261]:
Counter(name1).Counter(name2)

Counter({'y': 1})

In [214]:
#len(dict(Counter(name1)-Counter(name2)).keys())==1 and len(dict(Counter(name1)-Counter(name2)).keys())

In [409]:
name_sim_score(name1, name2)

sharma, sathyaprakash


[9.869736842105263,
 9.869736842105263,
 9.869736842105263,
 9.869736842105263,
 11.051619945095556]

In [247]:
sim_df=pd.read_csv(folder+"disambiguated_names.csv",sep=",")

In [248]:
sim_names=[a[0] for a in sim_df.values]

In [249]:
sim_names[0:5:2]

['Raju, Kv', 'Ajith Kumar, N.', 'Ajithkumar, N.']

In [253]:
check=[]
count=1
for a in zip(sim_names[0::2], sim_names[1::2]):
    tmp=name_sim_score(a[0], a[1])
    check.append(tmp)
    if tmp==False:
        count+=1
        print(a)

In [254]:
check.count("same")

7658

In [435]:
import numpy as np

In [436]:
np.array([1,2,3,4])+1

array([2, 3, 4, 5])

In [110]:
re.sub("[^\u0000-\u05C0\u2100-\u214F]+","","Arivunambi, A. (ĂƒÂ Ă‚Â®Ă‚â€¦ĂƒÂ Ă‚Â®Ă‚Â±ĂƒÂ Ă‚Â®Ă‚Â¿ĂƒÂ Ă‚Â®Ă‚ÂµĂƒÂ Ă‚Â¯Ă‚ï¿½ĂƒÂ Ă‚Â®Ă‚Â¨ĂƒÂ Ă‚Â®Ă‚Â®ĂƒÂ Ă‚Â¯Ă‚ï¿½ĂƒÂ Ă‚Â®Ă‚ÂªĂƒÂ Ă‚Â®Ă‚Â¿, Ăƒâ‚¬Ă‚Â®Ă‚â€¦)")

'Arivunambi, A. (ĂƒÂ\xa0ĂÂ®Ăâ¦ĂƒÂ\xa0ĂÂ®ĂÂ±ĂƒÂ\xa0ĂÂ®ĂÂ¿ĂƒÂ\xa0ĂÂ®ĂÂµĂƒÂ\xa0ĂÂ¯Ăï¿½ĂƒÂ\xa0ĂÂ®ĂÂ¨ĂƒÂ\xa0ĂÂ®ĂÂ®ĂƒÂ\xa0ĂÂ¯Ăï¿½ĂƒÂ\xa0ĂÂ®ĂÂªĂƒÂ\xa0ĂÂ®ĂÂ¿, Ăƒâ¬ĂÂ®Ăâ¦)'