In [930]:
import pdfplumber
import unicodedata
import pandas as pd
import re





In [931]:
# Function to extract text from PDF into a dataset of strings
ACCENT_MAP = {
    "ˇ": "\u030C",  # caron
    "´": "\u0301",  # acute
    "`": "\u0300",  # grave
    "¨": "\u0308",  # diaeresis
    "ˆ": "\u0302",  # circumflex
    "~": "\u0303",  # tilde
}

def repair_spacing_accents(text):
    # convert spacing accents -> combining accents
    for spacing, combining in ACCENT_MAP.items():
        text = re.sub(rf"([A-Za-z]){re.escape(spacing)}", rf"\1{combining}", text)

    # recombine into proper characters
    return unicodedata.normalize("NFC", text)

def extract_pdf_to_strings(pdf_path):
    """
    Extract text from each page of a PDF and return as a list of strings.
    
    Parameters:
    pdf_path (str): Path to the PDF file
    
    Returns:
    list: List of strings, one per page
    """
    strings = []
    
    with pdfplumber.open(pdf_path) as pdf:
        print(f"Total pages: {len(pdf.pages)}")
        for i, page in enumerate(pdf.pages):
            text = page.extract_text()
            if text is not None:
                text = unicodedata.normalize("NFC", text)
                text = repair_spacing_accents(text)
            strings.append(text)
            print(f"Extracted page {i+1}")
    
    return strings

In [932]:
strings = extract_pdf_to_strings("AuthorList.pdf")

Total pages: 11
Extracted page 1
Extracted page 2
Extracted page 3
Extracted page 2
Extracted page 3
Extracted page 4
Extracted page 5
Extracted page 4
Extracted page 5
Extracted page 6
Extracted page 7
Extracted page 8
Extracted page 6
Extracted page 7
Extracted page 8
Extracted page 9
Extracted page 10
Extracted page 11
Extracted page 9
Extracted page 10
Extracted page 11


In [933]:
# removing papers on first page
cutoff = strings[0].find('B.P.Abbott1')
strings[0] = strings[0][cutoff:]
first_name = strings[0][0:10]
print(first_name)

B.P.Abbott


In [934]:
strings[0]

'B.P.Abbott1, R.Abbott1, T.D.Abbott2, F.Acernese3,4, K.Ackley5,6, C.Adams7, T.Adams8, P.Addesso9, R.X.Adhikari1,\nV. B. Adya10, C. Affeldt10, M. Afrough11, B. Agarwal12, M. Agathos13, K. Agatsuma14, N. Aggarwal15, O. D. Aguiar16,\nL. Aiello17,18, A. Ain19, P. Ajith20, B. Allen10,21,22, G. Allen12, A. Allocca23,24, P. A. Altin25, A. Amato26, A. Ananyeva1,\nS. B. Anderson1, W. G. Anderson21, S. V. Angelova27, S. Antier28, S. Appert1, K. Arai1, M. C. Araya1, J. S. Areeda29,\nN. Arnaud28,30, K. G. Arun31, S. Ascenzi32,33, G. Ashton10, M. Ast34, S. M. Aston7, P. Astone35, D. V. Atallah36,\nP. Aufmuth22, C. Aulbert10, K. AultONeal37, C. Austin2, A. Avila-Alvarez29, S. Babak38, P. Bacon39, M. K. M. Bader14,\nS. Bae40, P. T. Baker41, F. Baldaccini42,43, G. Ballardin30, S. W. Ballmer44, S. Banagiri45, J. C. Barayoga1, S. E. Barclay46,\nB.C.Barish1, D.Barker47, K.Barkett48, F.Barone3,4, B.Barr46, L.Barsotti15, M.Barsuglia39, D.Barta49, S.D.Barthelmy50,\nJ.Bartlett47, I.Bartos51,5, R.Bassiri52, A

In [935]:
# removing description of numbers
cutoff = strings[-1].find('(SKA South Africa/MeerKAT)')
strings[-1] = strings[-1][:cutoff]
last_name = strings[-1][-13:]
print(last_name)

 A. Woudt953



In [936]:
#remove header and page numbers

for i, string in enumerate(strings):
    if i == len(strings):
        break
    else:
        strings[i] = string[:-2] # remove number at end of page assuming it is always 2 characters long (e.g. "1\n", "2\n", etc.)
    
    cutoff =string.find('Abbottetal.\n')
    if cutoff == -1:
        print(f"Header not found in page {i+1}")
    else:
        strings[i] = strings[i][cutoff + len('Abbottetal.\n'):]


    print(cutoff)




Header not found in page 1
-1
59
59
59
59
59
59
59
59
59
59


In [937]:

# pattern_1 = "("
# pattern_2 = ")"
# removed_counter = 0
# for i in range(len(strings)):
#     if pattern_1 in strings[i] and pattern_2 in strings[i]:
#         print(f"Found parentheses in page {i+1}")
#     start = 0
#     stop = 0
#     indicies_start = []
#     indicies_stop = []


#     while True:
#         index_1 = strings[i].find(pattern_1, start)
#         index_2 = strings[i].find(pattern_2, stop)
#         if index_1 == -1 or index_2 == -1:
#             break
#         indicies_start.append(index_1)
#         indicies_stop.append(index_2)
#         start = index_1 + 1
#         stop = index_2 + 1
#         strings[i] = strings[i].replace(strings[i][index_1:index_2+1], "")
#         removed_counter += 1

# print(f"Removed {removed_counter} instances of text between parentheses")

In [938]:
# Remove all parentheses and content (including nested)
removed_counter = 0
for i in range(len(strings)):
    original = strings[i]
    # Keep removing parentheses until none remain
    while '(' in strings[i]:
        strings[i] = re.sub(r'\([^()]*\)', '', strings[i])
    removed_counter += len(original) - len(strings[i])

print(f"Removed {removed_counter} characters (parentheses and content)")

Removed 1399 characters (parentheses and content)


In [939]:
strings[3]

'M.Yazback5, HangYu15, HaocunYu15, M.Yvert8, A.Zadrożny132, M.Zanolin37, T.Zelenova30, J.-P.Zendri55, M.Zevin89,\nL. Zhang1, M. Zhang140, T. Zhang46, Y.-H. Zhang58, C. Zhao65, M. Zhou89, Z. Zhou89, S. J. Zhu38,10, X. J. Zhu6,\nA. B. Zimmerman90, M. E. Zucker1,15, J. Zweizig1,\n,\nC. A. Wilson-Hodge137, E. Bissaldi161,162, L. Blackburn163,15, M. S. Briggs164, E. Burns50, W. H. Cleveland165,\nV. Connaughton165, M. H. Gibby166, M. M Giles166, A. Goldstein165, R. Hamburg164, P. Jenke164, C. M. Hui137,\nR. M. Kippen167, D. Kocevski137, S. McBreen168, C. A. Meegan164, W. S. Paciesas165, S. Poolakkil164, R. D. Preece164,\nJ. Racusin50, O. J. Roberts165, M. Stanbro164, P. Veres164, A. von Kienlin169,\n,\nV.Savchenko170, C.Ferrigno170, E.Kuulkers171, A.Bazzano172, E.Bozzo170, S.Brandt173, J.Chenevez173, T.J.-L.Courvoisier170,\nR. Diehl169, A. Domingo174, L. Hanlon168, E. Jourdain175, P. Laurent176,177, F. Lebrun176, A. Lutovinov178,179,\nA. Martin-Carrillo168, S. Mereghetti180, L. Natalucci172,

In [940]:
#remove newlines
for i in range(len(strings)):
    strings[i] = strings[i].replace("\n", "")


In [941]:
# split into list of names
names = []
for string in strings:
    names.extend(string.split(","))

In [942]:
print (len(names))
names = [re.sub(r'\d+$', '', name) for name in names]
names = [name.strip() for name in names]
names = [name for name in names if name]  # Remove empty strings
print(len(names))


4209
3614


In [943]:
print("Total names:", len(names))
print("\nPotential issues:")



# Check for suspicious entries
suspicious = [name for name in names if len(name) < 3 or name.isdigit()]
print(f"Suspicious entries (too short/only digits): {suspicious[:10]}")

empty_names = [name for name in names if not name or name.isspace()]
print(f"Empty or whitespace-only names: {len(empty_names)}")

# Check for names containing "and"


# Show first 10 and last 10 names
print(f"\nFirst 10 names: {names[:10]}")
print(f"Last 10 names: {names[-10:]}")

Total names: 3614

Potential issues:
Suspicious entries (too short/only digits): []
Empty or whitespace-only names: 0

First 10 names: ['B.P.Abbott', 'R.Abbott', 'T.D.Abbott', 'F.Acernese', 'K.Ackley', 'C.Adams', 'T.Adams', 'P.Addesso', 'R.X.Adhikari', 'V. B. Adya']
Last 10 names: ['O. Korobkin', 'R. T. Wollaeger', 'andF. Camilo', 'A. R. Foley', 'S. Goedhart', 'S. Makhathini', 'N. Oozeer', 'O. M. Smirnov', 'R. P. Fender', 'and P. A. Woudt']


In [944]:

names = [re.sub(r'^and\s*(?=[A-Z])', '', name).strip() for name in names]

names = [re.sub(r'\s+and$', '', name, flags=re.IGNORECASE).strip() for name in names]

names = [name for name in names if name]



print(f"After cleaning: {len(names)} names")
print(f"Last 10 names: {names[-10:]}")

After cleaning: 3614 names
Last 10 names: ['O. Korobkin', 'R. T. Wollaeger', 'F. Camilo', 'A. R. Foley', 'S. Goedhart', 'S. Makhathini', 'N. Oozeer', 'O. M. Smirnov', 'R. P. Fender', 'P. A. Woudt']


In [945]:
#removing spaces between initials and lastname:
names = [re.sub(r' ', '', name) for name in names]

In [946]:
for i, name in enumerate(names):
    
    split_index = name.rfind('.')
    if split_index != -1:
  
        last_name_start = split_index + 1
        
        
        initials = name[:split_index + 1]
        
   
        last_name = name[split_index + 1:].strip()
        
        
        if last_name:  # Only rearrange if there's a last name
            names[i] = last_name + ' ' + initials

In [947]:
names.sort()

print(f"Total names: {len(names)}")
print(f"First 10 names: {names[:10]}")
print(f"Last 10 names: {names[-10:]}")

Total names: 3614
First 10 names: ['ABeardsley', 'Aab A.', 'Aartsen M.G.', 'Abbott B.P.', 'Abbott R.', 'Abbott T.D.', 'Abbott T.M.C.', 'Abdalla H.', 'Abe F.', 'Abeysekara A.U.']
Last 10 names: ['vandenBrand J.F.J.', 'vanderHorst A.J.', 'vanderSchaaf L.', 'vanderWalt D.J.', 'vonKienlin A.', 'Álvarez J.D.', 'Šmída R.', 'Šupík J.', 'Żarnecki A.F.', 'Żywucka N.']


In [948]:
# Check for suspicious entries
print("Checking for errors...\n")

# Names with numbers or special characters
suspicious_chars = [name for name in names if any(c.isdigit() or c in ':;' for c in name)]
print(f"Names containing digits or special chars (':' or ';'): {len(suspicious_chars)}")
print(f"Examples: {suspicious_chars[:10]}\n")

# Names that are too short (likely errors)
too_short = [name for name in names if len(name) < 3]
print(f"Names shorter than 3 characters: {len(too_short)}")
print(f"Examples: {too_short[:20]}\n")

# Names without a space (should have "LastName Initials")
no_space = [name for name in names if ' ' not in name]
print(f"Names without space: {len(no_space)}")
print(f"Examples: {no_space}\n")

# Names without a dot (should have initials with dots)
no_dot = [name for name in names if '.' not in name]
print(f"Names without dots (missing initials): {len(no_dot)}")
print(f"Examples: {no_dot[:15]}")

Checking for errors...

Names containing digits or special chars (':' or ';'): 0
Examples: []

Names shorter than 3 characters: 0
Examples: []

Names without space: 16
Examples: ['ABeardsley', 'AbhirupGhosh', 'AlessandroNavarrini', 'ArchismanGhosh', 'ArunavaMukherjee', 'BobJacobs', 'CarloMigoni', 'ChungleeKim', 'HMiyasaka', 'HangYu', 'HaocunYu', 'HowardPan', 'Huang-WeiPan', 'M.ConstancioJr.', 'R.E.RyanJr.', 'SabrinaMilia']

Names without dots (missing initials): 14
Examples: ['ABeardsley', 'AbhirupGhosh', 'AlessandroNavarrini', 'ArchismanGhosh', 'ArunavaMukherjee', 'BobJacobs', 'CarloMigoni', 'ChungleeKim', 'HMiyasaka', 'HangYu', 'HaocunYu', 'HowardPan', 'Huang-WeiPan', 'SabrinaMilia']


In [949]:
names = [name for name in names if name not in no_dot]

print(f"Total names after removing no_dot entries: {len(names)}")
print(f"First 10 names: {names[:10]}")
print(f"Last 10 names: {names[-10:]}")

Total names after removing no_dot entries: 3600
First 10 names: ['Aab A.', 'Aartsen M.G.', 'Abbott B.P.', 'Abbott R.', 'Abbott T.D.', 'Abbott T.M.C.', 'Abdalla H.', 'Abe F.', 'Abeysekara A.U.', 'Abramo L.R.']
Last 10 names: ['vandenBrand J.F.J.', 'vanderHorst A.J.', 'vanderSchaaf L.', 'vanderWalt D.J.', 'vonKienlin A.', 'Álvarez J.D.', 'Šmída R.', 'Šupík J.', 'Żarnecki A.F.', 'Żywucka N.']


In [950]:
for i, name in enumerate(no_dot):
    # Find all positions of capital letters
    capitals = [j for j, c in enumerate(name) if c.isupper()]
    
    if len(capitals) >= 2:
        # Split at the second capital letter
        split_pos = capitals[1]
        first_name = name[:split_pos]
        last_name = name[split_pos:]
        no_dot[i] = last_name + ' ' + first_name + '.'


In [951]:
names.extend(no_dot)

# Sort the combined list
names.sort()

print(f"Total names after merge: {len(names)}")
print(f"First 10 names: {names[:10]}")
print(f"Last 10 names: {names[-10:]}")


Total names after merge: 3614
First 10 names: ['Aab A.', 'Aartsen M.G.', 'Abbott B.P.', 'Abbott R.', 'Abbott T.D.', 'Abbott T.M.C.', 'Abdalla H.', 'Abe F.', 'Abeysekara A.U.', 'Abramo L.R.']
Last 10 names: ['vandenBrand J.F.J.', 'vanderHorst A.J.', 'vanderSchaaf L.', 'vanderWalt D.J.', 'vonKienlin A.', 'Álvarez J.D.', 'Šmída R.', 'Šupík J.', 'Żarnecki A.F.', 'Żywucka N.']


In [952]:
names.sort()

print(f"Total names: {len(names)}")
print(f"First 10 names: {names[:10]}")
print(f"Last 10 names: {names[-10:]}")
no_space = [name for name in names if ' ' not in name]
print(f"Names without space: {len(no_space)}")
print(f"Examples: {no_space}\n")

Total names: 3614
First 10 names: ['Aab A.', 'Aartsen M.G.', 'Abbott B.P.', 'Abbott R.', 'Abbott T.D.', 'Abbott T.M.C.', 'Abdalla H.', 'Abe F.', 'Abeysekara A.U.', 'Abramo L.R.']
Last 10 names: ['vandenBrand J.F.J.', 'vanderHorst A.J.', 'vanderSchaaf L.', 'vanderWalt D.J.', 'vonKienlin A.', 'Álvarez J.D.', 'Šmída R.', 'Šupík J.', 'Żarnecki A.F.', 'Żywucka N.']
Names without space: 2
Examples: ['M.ConstancioJr.', 'R.E.RyanJr.']



In [953]:
names = [name for name in names if name not in no_space]

print(f"Total names after removing no_space entries: {len(names)}")
print(f"First 10 names: {names[:10]}")
print(f"Last 10 names: {names[-10:]}")

Total names after removing no_space entries: 3612
First 10 names: ['Aab A.', 'Aartsen M.G.', 'Abbott B.P.', 'Abbott R.', 'Abbott T.D.', 'Abbott T.M.C.', 'Abdalla H.', 'Abe F.', 'Abeysekara A.U.', 'Abramo L.R.']
Last 10 names: ['vandenBrand J.F.J.', 'vanderHorst A.J.', 'vanderSchaaf L.', 'vanderWalt D.J.', 'vonKienlin A.', 'Álvarez J.D.', 'Šmída R.', 'Šupík J.', 'Żarnecki A.F.', 'Żywucka N.']


In [954]:
for i, name in enumerate(no_space):
    if name == 'M.ConstancioJr.':
        no_space[i] = 'Constancio Jr. M.'
    if name == 'R.E.RyanJr.':
        no_space[i] = 'Ryan Jr. R.E.'



In [960]:
names.extend(no_space)

# Sort the combined list
names.sort()

print(f"Total names after merge: {len(names)}")
print(f"First 10 names: {names[10:100]}")


Total names after merge: 3616
First 10 names: ['Abramowski A.', 'Abreu P.', 'Acernese F.', 'Acero F.', 'Ackermann M.', 'Ackley K.', 'Ackley K.', 'Adams C.', 'Adams J.', 'Adams S.M.', 'Adams T.', 'Addesso P.', 'Adhikari R.X.', 'Adya V.B.', 'Affeldt C.', 'Afrough M.', 'Agarwal B.', 'Agathos M.', 'Agatsuma K.', 'Aggarwal N.', 'Aglietta M.', 'Agliozzo C.', 'Agudo I.', 'Aguiar O.D.', 'Aguilar J.A.', 'Aharonian F.', 'Ahlers M.', 'Ahrens M.', 'Aiello L.', 'Ain A.', 'AitBenkhali F.', 'Ajith P.', 'Akras S.', 'AlSamarai I.', 'Albert A.', 'Albert A.', 'Albuquerque I.F.M.', 'Albury J.M.', 'Alcaniz J.S.', 'Alexander K.D.', 'Alfaro R.', 'Allam S.', 'Allekotte I.', 'Allen B.', 'Allen G.', 'Allison J.', 'Allison J.R.', 'Allocca A.', 'Almela A.', 'Altin P.A.', 'Altmann D.', 'Alvarez C.', 'Alvarez-Muñiz J.', 'AlvarezCastillo J.', 'Amati L.', 'Amato A.', 'An T.', 'Ananyeva A.', 'Anastasi G.A.', 'Anchordoqui L.', 'Andeen K.', 'Anderson J.P.', 'Anderson S.B.', 'Anderson T.', 'Anderson W.G.', 'Andrada B.', 

In [959]:
# find middle name

index = int((len(names))/2)
print(index)
print(names[index-1])

1807
Lippert M.
