In [None]:
import os, glob, multiprocessing
import threading

import pandas as pd
from joblib import Parallel, delayed

import utils

# Show every column when a DataFrame is displayed (None removes the column limit).
pd.set_option('display.max_columns', None)

In [None]:

print('⚙️ Importing authors...')

path = os.path.join('datasets/autores/', "autores-*.csv")
# glob returns files in arbitrary order; sorting keeps the row order (and therefore
# the later 'first' aggregation) stable between runs and machines.
data_files = sorted(glob.glob(path))
if not data_files:
    # Fail with an actionable message instead of pandas' "No objects to concatenate".
    raise FileNotFoundError("No author CSV files match '{}'".format(path))
print(*data_files, sep = "\n")

# ignore_index=True gives every row a unique label; without it each file restarts
# at 0 and the combined frame carries duplicate index values.
df = pd.concat(
    (pd.read_csv(f, encoding='iso8859_1', delimiter=";") for f in data_files),
    ignore_index=True,
)

# Uncomment to work on a reproducible 5% sample of the dataset
# df = df.sample(frac=0.05, random_state=1)

print("   {} authors in the dataset".format(len(df)))

In [None]:
# Keep only the columns used by the rest of the notebook.
# DataFrame.filter ignores requested labels that are absent from the frame.
df = df.filter(
  items=[
    'AN_BASE',
    'NM_AUTOR',
    'NM_ABNT_AUTOR',
    'TP_AUTOR',
    'NM_TP_CATEGORIA_DOCENTE',
    'NM_NIVEL_DISCENTE',
    'CD_PROGRAMA_IES',
    'NM_PROGRAMA_IES',
    'NM_AREA_CONHECIMENTO',
    'SG_ENTIDADE_ENSINO',
    'ID_PESSOA_DISCENTE',
    'ID_PESSOA_DOCENTE',
    'ID_PESSOA_PART_EXTERNO',
    'ID_PESSOA_POS_DOC',
    'ID_PESSOA_EGRESSO',
    'ID_ADD_PRODUCAO_INTELECTUAL',
  ],
)

# Unify IDs
# Author type (TP_AUTOR) -> column that carries the person ID for that type.
# Types that are not listed here (for example '-') have no ID column.
_ID_COLUMN_BY_AUTHOR_TYPE = {
  'DOCENTE': 'ID_PESSOA_DOCENTE',
  'EGRESSO': 'ID_PESSOA_EGRESSO',
  'PÓS-DOC': 'ID_PESSOA_POS_DOC',
  'DISCENTE': 'ID_PESSOA_DISCENTE',
  'PARTICIPANTE EXTERNO': 'ID_PESSOA_PART_EXTERNO',
}

def unify_ids(cols):
    """Return the person ID that matches the row's author type.

    Parameters
    ----------
    cols : mapping or pandas.Series
        One row; must provide 'TP_AUTOR' and the ID column selected by it.

    Returns
    -------
    The value of the ID column associated with cols['TP_AUTOR'], or None when
    the author type has no associated ID column ('-', a missing value, or any
    type that is not in the table above).
    """
    # .get() instead of indexing: an unexpected or missing author type yields
    # "no ID" rather than a KeyError that aborts the whole DataFrame.apply.
    id_column = _ID_COLUMN_BY_AUTHOR_TYPE.get(cols['TP_AUTOR'])
    if id_column is None:
        return None
    return cols[id_column]

# Per-type ID columns that are collapsed into the single 'ID' column below.
ids = [
  'ID_PESSOA_DISCENTE',
  'ID_PESSOA_DOCENTE',
  'ID_PESSOA_PART_EXTERNO',
  'ID_PESSOA_POS_DOC',
  'ID_PESSOA_EGRESSO',
]

print('⚙️ Unifying author IDs...')
# unify_ids needs the author type plus every candidate ID column of the row.
id_inputs = df[['TP_AUTOR'] + ids]
df['ID'] = id_inputs.apply(unify_ids, axis=1)
# The per-type columns are redundant once 'ID' exists.
df = df.drop(labels=ids, axis='columns')

In [None]:
print('⚙️ Normalizing df names...')
# Replace NM_AUTOR with the output of utils.normalize_name, applied value by value.
df = df.assign(NM_AUTOR=df['NM_AUTOR'].apply(utils.normalize_name))

In [None]:
print('⚙️ Creating helper columns...')
# FIRST_LAST_NAME is derived from NM_AUTOR by utils.firstAndLastName;
# FULL_NAME is a plain copy of NM_AUTOR.
df = df.assign(
  FIRST_LAST_NAME=df['NM_AUTOR'].apply(utils.firstAndLastName),
  FULL_NAME=df['NM_AUTOR'],
)

In [None]:
# Aggregation applied to each column when several rows are merged into one author.
# FULL_NAME keeps the first value of the group, ID_ADD_PRODUCAO_INTELECTUAL is
# collected into a list, and every other column goes through utils.count_to_dict
# (its results are later updated as value -> count mappings by update_item_count).
merge_schema = {
  'FULL_NAME': 'first',
  **dict.fromkeys(
    (
      'NM_AUTOR',
      'NM_ABNT_AUTOR',
      'FIRST_LAST_NAME',
      'TP_AUTOR',
      'NM_TP_CATEGORIA_DOCENTE',
      'NM_NIVEL_DISCENTE',
      'CD_PROGRAMA_IES',
      'NM_PROGRAMA_IES',
      'NM_AREA_CONHECIMENTO',
      'SG_ENTIDADE_ENSINO',
    ),
    utils.count_to_dict,
  ),
  'ID_ADD_PRODUCAO_INTELECTUAL': list,
}

In [None]:
# Collapse all rows that share an ID into a single author row.
# Rows whose ID is missing are not part of any group and are handled separately.
print('⚙️ Merging authors by ID...')
merged_authors = (
  df
  .groupby(['ID'], sort=False, as_index=False)
  .agg(merge_schema)
)
print("   {} authors with ID after merge".format(merged_authors.shape[0]))

merged_authors

In [None]:
# Orphan authors: rows for which no ID could be determined.
has_no_id = df['ID'].isna()
orphan_authors = df.loc[has_no_id]
print("   {} authors without IDs".format(orphan_authors.shape[0]))

In [None]:
def compare_authors(author, orphan):
  """Score how likely `orphan` and `author` are the same person.

  `author` holds containers of previously seen values per field and `orphan`
  holds single values; each check is a membership test of the orphan's value
  in the author's container.

  Scoring: +5 when the orphan's FULL_NAME is among the author's NM_AUTOR
  values, +2 when FIRST_LAST_NAME matches. If neither name check matches the
  result is 0 and nothing else is examined. Otherwise +1 is added for each
  match on NM_ABNT_AUTOR, SG_ENTIDADE_ENSINO, TP_AUTOR and CD_PROGRAMA_IES.
  """
  name_score = 0
  if orphan['FULL_NAME'] in author['NM_AUTOR']:
    name_score += 5
  if orphan['FIRST_LAST_NAME'] in author['FIRST_LAST_NAME']:
    name_score += 2

  # Without any name evidence the secondary fields are not consulted at all.
  if not name_score:
    return 0

  # ABNT name, university, author type, IES program: one point each.
  secondary_fields = (
    'NM_ABNT_AUTOR',
    'SG_ENTIDADE_ENSINO',
    'TP_AUTOR',
    'CD_PROGRAMA_IES',
  )
  bonus = 0
  for field in secondary_fields:
    if orphan[field] in author[field]:
      bonus += 1

  return name_score + bonus

def update_item_count(item_count, value):
  """Increment, in place, the counter stored for `value` in the `item_count` mapping.

  A value that is not yet a key starts at 1. Nothing is returned.
  """
  item_count[value] = item_count.get(value, 0) + 1

def merge_authors(author, orphan_author):
  """Return a copy of `author` with the values of `orphan_author` folded in.

  Only columns that appear in `merge_schema` are touched, and FULL_NAME is
  always kept as it is in `author`. For each remaining column:

  * a list value gets the orphan's value appended;
  * any other value is treated as a value -> count mapping and the orphan's
    value is counted once more via `update_item_count`.

  The containers stored in `author` are NOT modified: the result carries
  fresh lists/dicts. (`Series.copy(deep=True)` does not copy nested Python
  objects, so the containers have to be duplicated explicitly.)

  Parameters
  ----------
  author : pandas.Series
      An already aggregated author row.
  orphan_author : pandas.Series
      A raw row holding one plain value per column.

  Returns
  -------
  pandas.Series
      The merged row, with the same index and name as `author`.
  """
  merged = author.copy(deep=True)
  for column in author.index.to_list():
    if column not in merge_schema or column == 'FULL_NAME': continue
    author_value = author[column]
    orphan_value = orphan_author[column]
    if isinstance(author_value, list):
      # New list, so the caller's list is left as it was.
      merged.at[column] = [*author_value, orphan_value]
    else:
      # Count into a private copy of the mapping.
      counts = dict(author_value)
      update_item_count(counts, orphan_value)
      merged.at[column] = counts

  return merged

# Merging orphan authors
merge_count = 0
append_count = 0

# Separate frame object, so the cell assignments below never touch `merged_authors`.
p_authors = merged_authors.copy()

# Orphans that match nobody are queued here and concatenated once after the
# parallel run (growing a DataFrame one row at a time is quadratic).
appended_orphans = []

# require='sharedmem' makes joblib run the workers as threads that share these
# globals, so every read-modify-write of shared state is done under this lock.
state_lock = threading.Lock()

def process_null_author(idx_na):
  """Merge one orphan (ID-less) author into `p_authors`, or queue it as a new row.

  The orphan at position `idx_na` of `orphan_authors` is compared with every
  author in `p_authors` whose FULL_NAME contains the orphan's last name. The
  first candidate scoring at least 5 in `compare_authors` receives the orphan's
  data (via `merge_authors`) and `merge_count` is incremented. If there is no
  such candidate, the orphan is aggregated with `merge_schema` into a one-row
  frame, added to `appended_orphans`, and `append_count` is incremented.

  Any error while handling the orphan is reported and the orphan is skipped,
  so one bad row does not stop the whole run.

  Parameters
  ----------
  idx_na : int
      Positional index into `orphan_authors`.
  """
  global merge_count, append_count

  orphan = None
  try:
    orphan = orphan_authors.iloc[idx_na]
    last_name = orphan['FULL_NAME'].split(' ')[-1]

    # Searching and merging happen under the lock so that no other worker
    # modifies the frame between the lookup and the write-back.
    with state_lock:
      # regex=False: the last name is literal text, not a pattern.
      is_candidate = p_authors['FULL_NAME'].str.contains(last_name, na=False, regex=False)
      potential_authors = p_authors[is_candidate]

      for idx_pot in range(len(potential_authors)):
        author = potential_authors.iloc[idx_pot]

        if compare_authors(author, orphan) >= 5:
          print("   🔄 Merging authors ({})'{}' to ({})'{}'".format(idx_na, orphan['FULL_NAME'], idx_pot, author['FULL_NAME']))
          merged = merge_authors(author, orphan)
          # Write the merged values back cell by cell. Assigning a whole frame
          # or Series would go through label alignment, which does not store
          # list/dict cells reliably.
          for column in merge_schema:
            if column != 'FULL_NAME':
              p_authors.at[author.name, column] = merged[column]
          merge_count = merge_count + 1
          return

    print("   Appending author ({})'{}'".format(idx_na, orphan['NM_AUTOR']))
    orphan_df = pd.DataFrame(orphan).T.groupby(['NM_AUTOR']).agg(merge_schema)
    with state_lock:
      appended_orphans.append(orphan_df)
      append_count = append_count + 1
  except Exception as error:
    # Best effort: report what failed and carry on with the next orphan.
    # `orphan` is still None when the row lookup itself failed.
    author_name = orphan.get('NM_AUTOR') if orphan is not None else None
    abnt_name = orphan.get('NM_ABNT_AUTOR') if orphan is not None else None
    print("   Error processing author ({})'{}', '{}'  -- skipping ({!r})".format(idx_na, author_name, abnt_name, error))

num_cores = multiprocessing.cpu_count()

print('⚙️ Merging authors without IDs (using {} cores):'.format(num_cores))

# Parallel processing
Parallel(n_jobs=num_cores, require='sharedmem')(delayed(process_null_author)(i) for i in range(len(orphan_authors)))

# Add all unmatched orphans in a single concat.
if appended_orphans:
  p_authors = pd.concat([p_authors, *appended_orphans], ignore_index=True)

print("   {} authors were merged and {} were appended to the dataset.".format(merge_count, append_count))

In [None]:
print('⚙️ Exporting authors to output/processed_authors.csv...')
# Make sure the target directory exists before writing.
os.makedirs('output/', exist_ok=True)
# The index is written as the first CSV column, labelled 'IDX'.
p_authors.index = p_authors.index.rename('IDX')
p_authors.to_csv('output/processed_authors.csv')