In [20]:
import pandas as pd
import numpy as np
import copy
import itertools
# Package for Levenstein string distance
import jellyfish
import time

In [21]:
path = "C://Users//Dimitri//Desktop//ENSAE3A//NetworkData//Data//"

In [22]:
attrs = pd.read_csv(path + "attrs_sub.csv")

In [23]:
def authors_parser(authors_string, sep=";"):
    authors = authors_string.split("; ")
    for i in range(0, len(authors)) : 
        if authors[i][0] == " ":
            authors[i] = authors[i][1:]
    return authors

In [24]:
attrs["authors_list"] = attrs["authors"].apply(authors_parser)

In [25]:
def normalized_edit_distance(str1, str2):
    dist = jellyfish.levenshtein_distance(str1, str2)
    return dist / max(len(str1), len(str2))

In [26]:
# Fill all authors
authors = []
for author in attrs["authors_list"]:
    authors += author
# Remove duplicates
authors = list(set(authors))

In [27]:
authors

['Wood, John H.',
 'Dasgupta, Sugato',
 'Graziano, Joseph',
 'Christos Cabolis',
 'Follmann, Dean A',
 'Aschauer, David',
 'Fallick, Bruce C',
 'Maskin, E.',
 'Regis Barnichon',
 'Lawrence M. Ausubel',
 'Zimmermann, Tom',
 'Radner, R',
 'McCafferty, Stephen',
 'Stulz, René M.',
 'Ji, Chuanshu',
 'Elizabeth Field-Hendrey',
 'Lipietz, Alain',
 'Frankena, Mark W.',
 'Eren, Ozkan',
 'Dick, Andrew R',
 'Zadrozny, Peter',
 'Apps, Patricia F.',
 'Mavromaras, Kostas G',
 'Van Horne, James C.',
 'Ermini, Luigi',
 'Waugh, Michael E.',
 'Bardos, Mireille',
 'Mona E. Dingle',
 'Schaefer, Mark E',
 'Pablo Selaya',
 'Sankaran, Jayaram K.',
 'Boies Penrose',
 'Abdullah, Dewan A',
 'Ritzen, Jozef M',
 'Perlo, Victor',
 'Elizabeth Frankenberg',
 'Werner Roeger',
 'Noh, Jaesum',
 'Ting, Michael M.',
 'Stephen D. Williamson',
 'Isaac Lippincott',
 'Werner, Therese',
 'Cao, Jie',
 'William C. Strange',
 'Tuzel, Selale',
 'Pollatschek, M A',
 'Lepage, Guillaume',
 'Fernandez, Roque B',
 'Jushan Bai',
 'Imb

In [28]:
df_authors = pd.DataFrame(authors, columns=["original"])

In [29]:
df_authors

Unnamed: 0,original
0,"Wood, John H."
1,"Dasgupta, Sugato"
2,"Graziano, Joseph"
3,Christos Cabolis
4,"Follmann, Dean A"
5,"Aschauer, David"
6,"Fallick, Bruce C"
7,"Maskin, E."
8,Regis Barnichon
9,Lawrence M. Ausubel


In [30]:
def uniformize_names(str1):
    if "," in str1:
        splitted = str1.split(", ")
        if len(splitted) > 1:
            str1bis = splitted[1] + " " + splitted[0]
        else :
            str1bis = str1
        return str1bis
    else:
        return str1

In [31]:
df_authors["uniformat"] = df_authors["original"].apply(uniformize_names)

In [32]:
df_authors_reduced = df_authors.iloc[11000: 12000, :].copy()

In [33]:
def map_authors(auths_df, thresh):
    cleaned_df = pd.DataFrame(columns=["original", "uniformat", "equivalent"])
    for i in auths_df.index:
        author = auths_df.loc[i, "uniformat"]
        author_original = auths_df.loc[i, "original"]
        splitted = author.split(" ")
        last = splitted[-1]
        search = cleaned_df["uniformat"].str.contains(last, regex=False)
        ind_search = search[search == True].index
        first = splitted[0]
        prop = (len(first) == 1) or ((len(first) == 2) and ("." in first))
        exists_similar = False
        for ind in ind_search :
            dist = normalized_edit_distance(cleaned_df["uniformat"][ind], 
                                            author)
            dist_test = dist <= thresh
            if dist_test :
                exists_similar = True
                cleaned_df["equivalent"][ind].append(author_original)    
            first_letter_test1 = prop and (first[0] == cleaned_df["uniformat"][ind][0])
            if first_letter_test1:
                exists_similar = True
                cleaned_df["equivalent"][ind].append(author_original) 
        if not exists_similar:
            df = pd.DataFrame(columns=["original", "uniformat", "equivalent"], index=[0])
            df.set_value(0, "original", author_original)
            df.set_value(0, "uniformat", author)
            df.set_value(0, "equivalent", [author_original])
            cleaned_df = cleaned_df.append(df, ignore_index=True)
        if i%1000 == 0:
            print(i)
    return cleaned_df

In [18]:
start = time.clock()
cleaned = map_authors(df_authors, 0.1)
# cleaned = map_authors(df_authors_reduced, 0.1)
end = time.clock()
print(end - start)

11000
4.963278587850505


In [19]:
cleaned

Unnamed: 0,original,uniformat,equivalent
0,Jacob A. Bikker,Jacob A. Bikker,[Jacob A. Bikker]
1,"Huang, David S",David S Huang,"[Huang, David S]"
2,"Smorodinsky, Meir",Meir Smorodinsky,"[Smorodinsky, Meir]"
3,"Kinal, Terrence W",Terrence W Kinal,"[Kinal, Terrence W]"
4,"Johnson, Ramon E.",Ramon E. Johnson,"[Johnson, Ramon E.]"
5,Neville Francis,Neville Francis,[Neville Francis]
6,Michael H. Riordan,Michael H. Riordan,"[Michael H. Riordan, Riordan, Michael H.]"
7,"Van Rompuy, P.",P. Van Rompuy,"[Van Rompuy, P.]"
8,"Sonnenschein, Hugo",Hugo Sonnenschein,"[Sonnenschein, Hugo]"
9,Nigel Pain,Nigel Pain,[Nigel Pain]


In [125]:
cleaned = cleaned.sort_values(by="uniformat")
cleaned = cleaned.reset_index(drop=True)

In [126]:
cleaned

Unnamed: 0,original,uniformat,equivalent,alpha_order
0,&Lubos Pástor,&Lubos Pástor,"[&Lubos Pástor, Lubos Pástor]",&
1,"Hao, (Grace) Qing",(Grace) Qing Hao,"[Hao, (Grace) Qing]",H
2,1816-1906,1816-1906,[1816-1906],1
3,"Kalay, A",A Kalay,"[Kalay, A]",K
4,"Auquier, A A",A A Auquier,"[Auquier, A A]",A
5,"Ryvkin, A A",A A Ryvkin,"[Ryvkin, A A]",R
6,"Walters, A A",A A Walters,"[Walters, A A]",W
7,"Abhyankar, A",A Abhyankar,"[Abhyankar, A]",A
8,"Schmid, A Allan",A Allan Schmid,"[Schmid, A Allan]",S
9,"Anastasopoulos, A",A Anastasopoulos,"[Anastasopoulos, A]",A


In [129]:
cleaned.to_csv(path + "authors.csv")

ValueError: labels ['alpha_order'] not contained in axis

In [158]:
def unpack_authors_list(authors_df):
    n_eqs = authors_df["equivalent"].apply(lambda x: len(x))
    max_eqs = n_eqs.max()
    for i in range(0, max_eqs):
        authors_df["equivalent_" + str(i)] = np.nan
    for i in authors_df.index:
        for j in range(0, n_eqs[i]):
            authors_df.set_value(i, "equivalent_" + str(j), authors_df.loc[i, "equivalent"][j])
    return authors_df

In [156]:
cleaned = unpack_authors_list(cleaned)

In [170]:
eqs_cols = ["equivalent_" + str(i) for i in range(0, 7)]

In [172]:
def author_corresp_bis(authors_df, eqs_cols, author_list):
    n_eqs = len(eqs_cols)
    authors_nos = []
    for author in author_list :
        found = False
        i = 0
        while (i <= n_eqs) and (not found):
            search = authors_df[eqs_cols[i]].str.contains(author, regex=False)
            ind_search = search[search == True].index
            if len(ind_search) >= 1:
                found = True
                authors_nos.append(ind_search[0])
            i += 1
    return authors_nos

In [178]:
start = time.clock()
attrs["authors_nos"] = attrs["authors_list"].apply(lambda x: author_corresp_bis(cleaned, eqs_cols, x))
end = time.clock()
print(end - start)
# attrs.to_csv(path + "attrs_nos.csv")

4029.402180439425


In [179]:
attrs.to_csv(path + "attrs_nos.csv")

In [180]:
attrs

Unnamed: 0,url,title,authors,date,jel_code,keywords,editor,journal,article_id,authors_list,authors_nos
0,https://ideas.repec.org/a/oup/qjecon/v1y1886i1...,The Reaction in Political Economy,Charles F. Dunbar,1886-02-02,,,oup,qjecon,v1y1886i1p1-27..html,[Charles F. Dunbar],[6160]
1,https://ideas.repec.org/a/oup/qjecon/v1y1886i1...,Private Monopolies and Public Rights,Arthur T. Hadley,1886-02-02,,,oup,qjecon,v1y1886i1p28-44..html,[Arthur T. Hadley],[3334]
2,https://ideas.repec.org/a/oup/qjecon/v1y1886i1...,Silver Before Congress in 1886,S. Dana Horton,1886-02-02,,,oup,qjecon,v1y1886i1p45-75..html,[S. Dana Horton],[36527]
3,https://ideas.repec.org/a/oup/qjecon/v1y1886i1...,"The Arithmetic, Geometric, and Harmonic Means",F. Coggeshall,1886-02-02,,,oup,qjecon,v1y1886i1p83-86..html,[F. Coggeshall],[11962]
4,https://ideas.repec.org/a/oup/qjecon/v1y1886i1...,Legislation for Labor Arbitration,H. M. Williams,1886-02-02,,,oup,qjecon,v1y1886i1p86-91..html,[H. M. Williams],[15170]
5,https://ideas.repec.org/a/oup/qjecon/v1y1886i1...,Correspondence,Arthur Mangin,1886-02-02,,,oup,qjecon,v1y1886i1p91-102..html,[Arthur Mangin],[3314]
6,https://ideas.repec.org/a/oup/qjecon/v1y1887i2...,An Historical Sketch of the Knights of Labor,Carroll D. Wright,1887-02-02,,,oup,qjecon,v1y1887i2p137-168..html,[Carroll D. Wright],[5822]
7,https://ideas.repec.org/a/oup/qjecon/v1y1887i2...,The Disposition of Our Public Lands,Albert Bushnell Hart,1887-02-02,,,oup,qjecon,v1y1887i2p169-183..html,[Albert Bushnell Hart],[1003]
8,https://ideas.repec.org/a/oup/qjecon/v1y1887i2...,The South-Western Strike of 1886,F. W. Taussig,1887-02-02,,,oup,qjecon,v1y1887i2p184-222..html,[F. W. Taussig],[31109]
9,https://ideas.repec.org/a/oup/qjecon/v1y1887i2...,Marshall's Theory of Value and Distribution,J. Laurence Laughlin,1887-02-02,,,oup,qjecon,v1y1887i2p227-232..html,[J. Laurence Laughlin],[17717]
