In [1]:
import html
from html.parser import HTMLParser
from html.entities import name2codepoint
from pylatexenc.latex2text import LatexNodes2Text
from rapidfuzz import process, fuzz
import threading
import pprint

In [2]:
# Parse through source.txt and make a dictionary with key as name and value as number of appearances
# The names are unsanitised(containing HTML and Latex symbols).
# Similar names are taken distinct here.
# After the dictionary is made, the names are written in author_names.txt file

# Execution time: 3.3 seconds
names = {}
with open('source.txt', 'r') as f:
    for line in f:
        line = line.strip()
        if(line[0:2] == '#@'):
            line = line.lower()
            line_names = line[2:].split(',')
            for name in line_names:
                name = name.strip()
                if name != '' and name[-1] == '\n':
                    name = name[:-1]
                name = name.strip()
                if name == '':
                    continue
                if name in names:
                    names[name] += 1
                else:
                    names[name] = 1

with open('author_names.txt', 'w') as f:
    for key in sorted(names.keys()):
        f.write(key+'\n')

In [3]:
# Execution time: 184 seconds

# Sanitises the names present in the above generated author_names.txt file.
#
# ...............Write more info..................
#
# Creates a dictionary with key as sanitised_name and value as number of appearances.
# Writes the sanitised names in author_names_sanitised.txt file

# Very slow. One solution searching for \,$ gave like 30 results. So sanitise them only
sanitised_names = {}

def raw_string_c_(og_name):
    modified_name = og_name
    modified_name = modified_name.replace('\W', 'W')
    # modified_name = modified_name.replace("\''a", "\\\'a")
    modified_name = modified_name.replace("\a", "\\a")
    modified_name = modified_name.replace("\v", "\\v")
    modified_name = modified_name.replace("\b", "\\b")
    modified_name = modified_name.replace("\"", '\\"')
    modified_name = modified_name.replace("\''", '\\\'')
    return modified_name

def name_corrector(og_name):
    name_modified1 = og_name
    name_modified2 = raw_string_c_(og_name)
    name_modified1 = name_modified1.lower()
    name_modified1 = html.unescape(name_modified1)
    name_modified2 = LatexNodes2Text().latex_to_text(name_modified2)
    if len(name_modified1) == len(name_modified2):
        final_name = og_name
    elif len(name_modified1) > len(name_modified2):
        final_name = name_modified2
    else:
        final_name = name_modified1
    return final_name

for key in names:
    sanitised_names[key] = name_corrector(key)

sorted_sanitised_names = sorted(list(sanitised_names.keys()))

In [4]:
assert(len(sanitised_names) == len(names))

In [5]:
s = {}
for key in sorted_sanitised_names:
    if len(key) == 0:
        print('Here') # It never reached here. No key is empty in sanitised_names.
        continue
    if key[0] in s:
        s[key[0]] += 1
    else:
        s[key[0]] = 1

pprint.pprint(s)

sum1 = 0
for key in sorted(s.keys()):
    if key <= 'a':
        sum1 += s[key]

sum2 = 0
for key in sorted(s.keys()):
    if key >= 'z':
        sum2 += s[key]

lengths = []
for i in range(1,27):
    if i == 1:
        lengths.append(sum1)
    elif i == 26:
        lengths.append(sum2)
    else:
        lengths.append(s[chr(i+96)])
lengths.insert(0, 0)
cu_lengths = [sum(lengths[0:x+1]) for x in range(0, len(lengths)+1)]

{'"': 3,
 '&': 11,
 "'": 2,
 '(': 4,
 '.': 16,
 '0': 3,
 '1': 2,
 '2': 2,
 '3': 1,
 '?': 1,
 '_': 1,
 '`': 1,
 'a': 45609,
 'b': 19813,
 'c': 32069,
 'd': 33980,
 'e': 17535,
 'f': 14813,
 'g': 23889,
 'h': 24248,
 'i': 8526,
 'j': 61027,
 'k': 23683,
 'l': 20898,
 'm': 56300,
 'n': 13545,
 'o': 4743,
 'p': 27264,
 'q': 1841,
 'r': 36822,
 's': 45204,
 't': 24452,
 'u': 2389,
 'v': 10113,
 'w': 15562,
 'x': 5368,
 'y': 15585,
 'z': 6456,
 '\x8a': 1,
 'à': 10,
 'á': 147,
 'â': 2,
 'ä': 1,
 'å': 24,
 'æ': 1,
 'ç': 15,
 'è': 2,
 'é': 84,
 'ê': 1,
 'ì': 1,
 'í': 19,
 'ï': 2,
 'ò': 2,
 'ó': 22,
 'ö': 91,
 'ø': 37,
 'ú': 1,
 'ü': 12,
 'ć': 1,
 'č': 4,
 'ī': 1,
 'ľ': 1,
 'ł': 26,
 'ş': 25,
 'š': 24,
 'ž': 11}


In [6]:
print(lengths)
print(cu_lengths)
assert(sum(lengths) == len(sanitised_names)) # character division is proper
assert(cu_lengths[-1] == len(sanitised_names))

[0, 45656, 19813, 32069, 33980, 17535, 14813, 23889, 24248, 8526, 61027, 23683, 20898, 56300, 13545, 4743, 27264, 1841, 36822, 45204, 24452, 2389, 10113, 15562, 5368, 15585, 7024]
[0, 45656, 65469, 97538, 131518, 149053, 163866, 187755, 212003, 220529, 281556, 305239, 326137, 382437, 395982, 400725, 427989, 429830, 466652, 511856, 536308, 538697, 548810, 564372, 569740, 585325, 592349, 592349]


In [23]:
# This will group the names according to their similarity in fuzzy matching(95 percent threshold chosen).
# group_names dictionary has key as the name and value as a list of names that are similar to it as values

count = 0
lock = threading.Lock() 
name_dicts = [None]*26
for i in range(0, 26):
    name_dicts[i] = {'grouped_names': {}}
''' Takes indices of keys of sanitised_names, groups them according to their similarity and writes them in
grouped_names dictionary.
'''
def group_names(thread_id, start_idx, end_idx):
    global count
    for idx in range(start_idx, end_idx+1):
        key = sorted_sanitised_names[idx]
        if len(name_dicts[thread_id]['grouped_names']) == 0:
            name_dicts[thread_id]['grouped_names'][key] = []
            continue
        match = process.extractOne(key, name_dicts[thread_id]['grouped_names'].keys(), scorer=fuzz.token_sort_ratio)
        if match[1] > 90:
            name_dicts[thread_id]['grouped_names'][match[0]].append(key)
        else:
            name_dicts[thread_id]['grouped_names'][key] = []
        lock.acquire()
        count+=1
        if count % 3000 == 0:
            print(f"Grouping progress: {count/len(names)*100.0}% done. Total: {len(names)}, Remaining: {len(names)-count}")
        lock.release()
    

In [24]:
count = 0
threads = []
for i in range(0,26):
    threads.append(threading.Thread(target=group_names, args=(i, cu_lengths[i], cu_lengths[i+1]-1)))

for thread in threads:
    thread.start()

for thread in threads:
    thread.join()

count2 = 0
with open(f'author_names_grouped.txt', 'w') as f:
    for i in range(0,26):
        for key in  name_dicts[i]['grouped_names'].keys():
            f.write(f"{count2}% {key}\n") # using % because its not present in author_names_grouped.txt
            for name in name_dicts[i]['grouped_names'][key]:
                f.write(f"{count2}% {name}\n")
            count2+=1

Grouping progress: 0.5064581859680695% done. Total: 592349, Remaining: 589349
Grouping progress: 1.012916371936139% done. Total: 592349, Remaining: 586349
Grouping progress: 1.5193745579042084% done. Total: 592349, Remaining: 583349
Grouping progress: 2.025832743872278% done. Total: 592349, Remaining: 580349
Grouping progress: 2.5322909298403475% done. Total: 592349, Remaining: 577349
Grouping progress: 3.038749115808417% done. Total: 592349, Remaining: 574349
Grouping progress: 3.5452073017764865% done. Total: 592349, Remaining: 571349
Grouping progress: 4.051665487744556% done. Total: 592349, Remaining: 568349


KeyboardInterrupt: 

Grouping progress: 4.558123673712625% done. Total: 592349, Remaining: 565349
Grouping progress: 5.064581859680695% done. Total: 592349, Remaining: 562349
Grouping progress: 5.571040045648765% done. Total: 592349, Remaining: 559349
Grouping progress: 6.077498231616834% done. Total: 592349, Remaining: 556349
Grouping progress: 6.583956417584903% done. Total: 592349, Remaining: 553349
Grouping progress: 7.090414603552973% done. Total: 592349, Remaining: 550349
Grouping progress: 7.596872789521042% done. Total: 592349, Remaining: 547349
Grouping progress: 8.103330975489111% done. Total: 592349, Remaining: 544349


In [17]:
sanitised_to_id = {}
with open('author_names_grouped.txt', 'r') as f:
    for line in f:
        line = line[0:-1]
        a = line.split('% ')
        # if(a[1] == "carlos d'andrea"):
        #     print("Here2")
        if a[1] in sanitised_to_id:
            print('Here')
            print(a[1])
        sanitised_to_id[a[1]] = a[0]

In [18]:
print(len(sanitised_to_id), len(sanitised_names))

592349 592349


In [19]:
extras = {}
sentinel = 600000
check1 = 0
check2 = 0
with open('source.txt', 'r') as f1, open('source_clean.txt', 'w') as f2:
    for line in f1:
        # line = line.strip()
        if(line[0:2] == '#@'):
            if line[0:-1] == "#@": # This is the case when paper has no author.
                f2.write(line)
                continue
            line = line.lower()
            line_names = line[2:].split(',')
            f2.write('#@')
            for name in line_names:
                name = name.strip()
                if name != '' and name[-1] == '\n':
                    name = name[:-1]
                name = name.strip()
                if name == '':
                    continue
                if name in sanitised_names.keys():
                    if sanitised_names[name] in sanitised_to_id.keys():
                        f2.write(f"{sanitised_to_id[sanitised_names[name]]},")
                    else:
                        # print('Here')
                        check1+=1
                        # print(f"{sanitised_names[name]}")
                        extras[sanitised_names[name]] = sentinel
                        f2.write(f"{sentinel},")
                        sentinel+=1
                else:
                    # print('Here')
                    check2+=1
            f2.write('\n')
        else:
            f2.write(line)              

In [20]:
print(check1, check2)

571 0


In [21]:
# Check if replacements to source_clean are error free or not by asserting authors replaced by integers
with open('source_clean.txt', 'r') as f:
    for line in f:
        if(line[0:2] == '#@'):
            if line[0:-1] == "#@": # This is the case when paper has no author.
                continue
            line = line[0:-1]
            line_numbers = line[2:].split(',')
            for number in line_numbers:
                number = number.strip()
                if number == '':
                    continue
                if(not number.isnumeric()):
                    print(f"{number} is not a number")
        

In [22]:
with open('extras.txt', 'w') as f:
    for key in extras.keys():
        f.write(f"{extras[key]}% {key}\n")