In [1]:
import numpy as np

def LevenshteinDistance(s1,s2):
    """Return the Levenshtein (edit) distance between s1 and s2.

    The distance is the minimum number of single-element insertions,
    deletions and substitutions needed to turn s1 into s2. It is computed
    with the classic dynamic-programming table of size
    (len(s1)+1) x (len(s2)+1).
    """
    rows = len(s1) + 1
    cols = len(s2) + 1
    table = np.zeros((rows, cols), dtype=int)

    # Borders: turning a prefix into the empty string (or the reverse)
    # costs exactly the prefix length.
    table[:, 0] = np.arange(rows)
    table[0, :] = np.arange(cols)

    for r, item1 in enumerate(s1, start=1):
        for c, item2 in enumerate(s2, start=1):
            substitution = table[r - 1, c - 1] + (0 if item1 == item2 else 1)
            deletion = table[r - 1, c] + 1
            insertion = table[r, c - 1] + 1
            table[r, c] = min(deletion, insertion, substitution)

    # Bottom-right cell holds the distance between the full sequences.
    return table[rows - 1, cols - 1]

In [2]:
def fragmentComparison(s1,s2,lim):
    """
    Decide whether two names are compatible, comparing them fragment by fragment.

    Require : strings s1 and s2 (fragments are separated by a single space)
    Require : a threshold value lim for the Levenshtein distance
    Ensure  : True if s1 and s2 are compatible, False otherwise

    Rules, in order:
      1. First fragments: if both are longer than one character, their
         Levenshtein distance must not exceed lim; otherwise the
         one-character fragment must equal the first character of the other.
      2. Last fragments: both must be longer than one character and their
         Levenshtein distance must not exceed lim.
      3. Middle fragments (everything but first and last) are paired up in
         four passes (full/full, full/initial, initial/full,
         initial/initial). The names are compatible when at least one of
         the two names has all of its middle fragments paired.

    Empty fragments (empty input, leading or doubled spaces) never raise;
    an empty first fragment only matches another empty first fragment.
    """
    c1 = s1.split(' ')
    c2 = s2.split(' ')

    n1 = len(c1)
    n2 = len(c2)

    # --- first fragment ---------------------------------------------------
    first1, first2 = c1[0], c2[0]
    if len(first1) > 1 and len(first2) > 1:
        if LevenshteinDistance(first1, first2) > lim:
            return False
    elif len(first1) > 1:
        # first2 is an initial (or empty): it must match first1's initial.
        if first1[0] != first2:
            return False
    else:
        # first1 is an initial (or empty). Slice instead of indexing so that
        # an empty first2 compares as '' rather than raising IndexError.
        if first1 != first2[:1]:
            return False

    # --- last fragment ----------------------------------------------------
    last1, last2 = c1[n1 - 1], c2[n2 - 1]
    if not (len(last1) > 1 and len(last2) > 1):
        # An abbreviated (or empty) last fragment is never accepted.
        return False
    if LevenshteinDistance(last1, last2) > lim:
        return False

    # --- middle fragments -------------------------------------------------
    middle1 = range(1, n1 - 1)
    middle2 = range(1, n2 - 1)
    marked1 = [False] * n1
    marked2 = [False] * n2

    # Pass 1: full fragment vs full fragment.
    # NOTE(review): this pass uses a strict '< lim' while the first/last
    # checks accept distances up to and including lim — confirm intended.
    for i in middle1:
        for j in middle2:
            if ( len(c1[i]) > 1 and len(c2[j]) > 1
                    and LevenshteinDistance(c1[i], c2[j]) < lim ):
                marked1[i] = True
                marked2[j] = True

    # Pass 2: still-unmarked full fragment of s1 vs initial of s2.
    for i in middle1:
        for j in middle2:
            if ( not marked1[i] and len(c1[i]) > 1
                    and len(c2[j]) == 1 and c1[i][0] == c2[j] ):
                marked1[i] = True
                marked2[j] = True

    # Pass 3: initial of s1 vs still-unmarked full fragment of s2.
    for i in middle1:
        for j in middle2:
            if ( not marked2[j] and len(c2[j]) > 1
                    and len(c1[i]) == 1 and c1[i] == c2[j][0] ):
                marked1[i] = True
                marked2[j] = True

    # Pass 4: initial vs initial, both still unmarked.
    for i in middle1:
        for j in middle2:
            if ( not marked1[i] and not marked2[j]
                    and len(c1[i]) == 1 and len(c2[j]) == 1
                    and c1[i] == c2[j] ):
                marked1[i] = True
                marked2[j] = True

    # Compatible only if at least one name has every middle fragment marked.
    unmatched_in_1 = any(not marked1[i] for i in middle1)
    unmatched_in_2 = any(not marked2[j] for j in middle2)
    return not (unmatched_in_1 and unmatched_in_2)

In [3]:
pip install json_stream

Note: you may need to restart the kernel to use updated packages.


In [4]:
import json
import json_stream

def sortShortAndLongNameRecords(g):
    """
    Split a JSON array of authorship records into two JSON files.

    Require: g, an open text file (with a usable `name` attribute) that
             contains a JSON array of authorship records. Each record must
             provide, in this order, the keys "author", "coauthors",
             "publicationYear", "enTitle" and "frTitle".
    Ensure:  two files are written in the current directory, named
             "shortNameRecordsOf<name>" and "longNameRecordsOf<name>".
             A record goes to the short-name file when at least one
             space-separated fragment of its author name has two characters
             or fewer; otherwise it goes to the long-name file.

    The input is streamed with json_stream, so records are never all held
    in memory at once. The number of records written to each file is
    printed at the end. Nothing is returned.
    """
    # str.strip removes any leading/trailing '.' and '\' characters (it is a
    # character set, not a prefix), which turns ".\file.json" into "file.json".
    base_name = g.name.strip(".\\")
    outputShort_name = ".\\shortNameRecordsOf" + base_name
    outputLong_name = ".\\longNameRecordsOf" + base_name

    nb_longName = 0
    nb_shortName = 0

    # 'with' guarantees both outputs are closed even if a record is malformed.
    with open(outputShort_name, 'w+', encoding="ISO-8859-1") as outputShort, \
         open(outputLong_name, 'w+', encoding="ISO-8859-1") as outputLong:

        outputShort.write('[')
        outputLong.write('[')

        for authorshipRecord in json_stream.load(g):

            # The stream is consumed forward-only: keys are read in the
            # order they appear in the file, "author" first.
            author = authorshipRecord["author"]

            # A name is "short" if any fragment has two characters or fewer.
            is_short = any(len(fragment) <= 2 for fragment in author.split(' '))

            # Write the separator only when the target file already holds a
            # record, so that neither array starts with a stray comma.
            if is_short:
                output = outputShort
                if nb_shortName != 0:
                    output.write(',')
                nb_shortName += 1
            else:
                output = outputLong
                if nb_longName != 0:
                    output.write(',')
                nb_longName += 1

            Dict = {}
            Dict["author"] = author
            Dict["coauthors"] = list(authorshipRecord["coauthors"])
            Dict["publicationYear"] = authorshipRecord["publicationYear"]
            Dict["enTitle"] = authorshipRecord["enTitle"]
            Dict["frTitle"] = authorshipRecord["frTitle"]
            json.dump(Dict, output, ensure_ascii=False, indent=4)

        outputShort.write(']')
        outputLong.write(']')

    print("Il y a", nb_longName, "Authorship Record de long nom")
    print("Il y a", nb_shortName, "Authorship Record de court nom")
In [5]:
# Split the raw authorship records into short-name and long-name files.
# The 'with' block closes the input even if the split raises.
with open(r".\authorshipRecords_20000.json",'r',encoding="ISO-8859-1") as g:
    sortShortAndLongNameRecords(g)

Il y a 295 Authorship Record de long nom
Il y a 51 Authorship Record de court nom


In [6]:
import json
import json_stream
import os

def _clusterFilePath(index):
    """Return the path of the JSON file that stores cluster number `index`."""
    return ".\\firstStep\\" + str(index) + "authorshipRecordCluster.json"


def _readClusterCoauthors(path):
    """Return the coauthor names of every record stored in a cluster file."""
    with open(path, 'r', encoding="ISO-8859-1") as cluster_file:
        return [
            coauthor
            for record in json_stream.load(cluster_file)
            for coauthor in record["coauthors"]
        ]


def _appendRecordToCluster(path, record):
    """Append `record` to the JSON array stored in an existing cluster file."""
    # Remove the final byte (the closing ']') so the array can be extended.
    with open(path, 'rb+') as cluster_file:
        cluster_file.seek(-1, os.SEEK_END)
        cluster_file.truncate()

    with open(path, 'a', encoding="ISO-8859-1") as cluster_file:
        cluster_file.write(',')
        json.dump(record, cluster_file, ensure_ascii=False, indent=4)
        cluster_file.write(']')


def processList (A, Ci):
    """
    Assign every authorship record of A to a cluster, creating clusters as needed.

    Require: A, an open text file containing a JSON array of authorship
             records (keys "author", "coauthors", "publicationYear",
             "enTitle", "frTitle", in this order)
    Require: Ci, list of existing clusters, each named by the author name of
             its first record; cluster number i is stored in the file
             "<i>authorshipRecordCluster.json" of the "firstStep" directory
    Ensure:  C0, the list of cluster names after processing A

    A record joins the first cluster whose name is compatible with the
    record's author (see fragmentComparison) AND that already contains a
    coauthor compatible with one of the record's coauthors. Otherwise a new
    cluster file is created and the author name is appended to the list.

    Note: Ci is extended in place; the returned list is the same object.
    The "firstStep" directory must already exist.
    """

    lim = 2 #threshold value lim of LevenshteinDistance

    C0 = Ci
    for authorshipRecord in json_stream.load(A):

        # Keys are read in file order because the stream is forward-only.
        author = authorshipRecord["author"]
        coauthors = list(authorshipRecord["coauthors"])

        Dict = {}
        Dict["author"] = author
        Dict["coauthors"] = coauthors
        Dict["publicationYear"] = authorshipRecord["publicationYear"]
        Dict["enTitle"] = authorshipRecord["enTitle"]
        Dict["frTitle"] = authorshipRecord["frTitle"]

        inserted = False
        for i, c in enumerate(C0):
            # The author must be compatible with the name of the cluster ...
            if not fragmentComparison(author, c, lim):
                continue

            # ... and share at least one compatible coauthor with it.
            cluster_path = _clusterFilePath(i)
            coauthors_check = _readClusterCoauthors(cluster_path)
            shares_coauthor = any(
                fragmentComparison(coauthorCompared, coauthor_check, lim)
                for coauthorCompared in coauthors
                for coauthor_check in coauthors_check
            )
            if shares_coauthor:
                _appendRecordToCluster(cluster_path, Dict)
                inserted = True
                break

        # No suitable cluster: a new one is created with this record.
        if not inserted:
            with open(_clusterFilePath(len(C0)), 'w', encoding="ISO-8859-1") as file_write:
                file_write.write('[')
                json.dump(Dict, file_write, ensure_ascii=False, indent=4)
                file_write.write(']')

            C0.append(author)
    return C0

In [7]:
# Cluster the long-name records first, then add the short-name records to
# the resulting clusters. 'with' closes each input even if clustering raises.
C1 = []
with open(r".\longNameRecordsOfauthorshipRecords_20000.json",'r',encoding="ISO-8859-1") as L:
    C2 = processList(L,C1)

with open(r".\shortNameRecordsOfauthorshipRecords_20000.json",'r',encoding="ISO-8859-1") as S:
    C3 = processList(S,C2)
print(C3)

['Dcns', 'Keruel Bernard', 'Pierre Nicolas', 'Obriot Bruno', 'Badra Mohamad', 'Borghol Badra Rouba', 'Hajjeh Ibrahim', 'Gardan Remy', 'Panaget', 'Touze Jean Marie', 'Thuaire Micheline', 'Thuaire Raymond', "Al'Stom Transport Teknolodzhis", 'Dagdag Selim', 'Boccard', 'Grgoire Philippe', "D'Hayer Benoît", 'Tibi Annick', 'Husson Marie-Caroline', 'Boudy Vincent', 'Dcns', 'Niot Stephane', 'Vouillat Jerome', 'Salles Bernard', 'Jullien Denis', 'Universite Paul Sabatier Toulouse Iii', 'Mirey Gladys', 'Vignard Julien', 'Benoist Jean-Claude', 'Chapel Julien', 'Moignard Jeremy', 'Boucher Mathieu', 'Jastrzebski Alain', 'Bricaud Herve', 'Injection Haute Precision', 'Djian Francis', 'Proteor', 'Bellon Bernard', 'Thales', 'Queinnec Jean-Yves', 'Valeo Systemes Thermiques', 'Aircelle', 'Bardy Julien', 'Duchamp Boris', 'Veyet Frederick', 'Msika Philippe', 'Legran Zhak', 'Laboratoires Expanscience', 'Garnier Sebastien', 'Expanscience Lab', 'Msika Filipp', 'Legrand Jacques', 'Auger Aurelien', 'Alcatel Luce

In [8]:
# Number of clusters: processList appends one author name per cluster it creates.
len(C3)

251