In [None]:
import glob
import multiprocessing
import os
import threading
from collections import defaultdict

import pandas as pd
from joblib import Parallel, delayed

import utils

# Never truncate the column list when a DataFrame is rendered
pd.options.display.max_columns = None

In [None]:

print('⚙️ Importing authors...')

# Collect every file matching the pattern. The matches are sorted because
# glob.glob returns them in arbitrary, filesystem-dependent order, and the
# 'first' aggregations used later in the notebook depend on row order.
path = os.path.join('datasets/autores/', "autores-*.csv")
data_files = sorted(glob.glob(path))
print(*data_files, sep = "\n")

# Fail with an actionable message instead of pandas' generic
# "No objects to concatenate" when nothing matches the pattern.
if not data_files:
    raise FileNotFoundError("No author files found matching '{}'".format(path))

# ignore_index gives the combined frame one unique RangeIndex instead of
# repeating every file's own 0..n-1 row labels.
df = pd.concat(
    (pd.read_csv(f, encoding='iso8859_1', delimiter=";") for f in data_files),
    ignore_index=True,
)

# Optionally work on a 1% sample of the authors only
# df = df.sample(frac=0.01, random_state=1)

print("   {} authors in the dataset".format(len(df)))

In [None]:
# Keep only the columns used by the rest of the notebook
fields_of_interest = [
  'NM_AUTOR',
  'NM_ABNT_AUTOR',
  'TP_AUTOR',
  'NM_TP_CATEGORIA_DOCENTE',
  'NM_NIVEL_DISCENTE',
  'CD_PROGRAMA_IES',
  'NM_PROGRAMA_IES',
  'NM_AREA_CONHECIMENTO',
  'SG_ENTIDADE_ENSINO',
  'ID_PESSOA_DISCENTE',
  'ID_PESSOA_DOCENTE',
  'ID_PESSOA_PART_EXTERNO',
  'ID_PESSOA_POS_DOC',
  'ID_PESSOA_EGRESSO',
  'ID_ADD_PRODUCAO_INTELECTUAL',
]
df = df.filter(items=fields_of_interest)

# Unify IDs
def unify_ids(cols):
    """Return the person ID that corresponds to the row's author type.

    Args:
        cols: Mapping-like row (e.g. a pandas Series) holding 'TP_AUTOR' and
            the 'ID_PESSOA_*' column that belongs to that author type.

    Returns:
        The value of the ID column selected by cols['TP_AUTOR'], or None when
        the author type is '-', missing, or not one of the known types.
    """
    id_column_by_type = {
      'DOCENTE': 'ID_PESSOA_DOCENTE',
      'EGRESSO': 'ID_PESSOA_EGRESSO',
      'PÓS-DOC': 'ID_PESSOA_POS_DOC',
      'DISCENTE': 'ID_PESSOA_DISCENTE',
      'PARTICIPANTE EXTERNO': 'ID_PESSOA_PART_EXTERNO',
    }
    # .get() keeps one unexpected author type (or NaN) from raising KeyError
    # and aborting the whole row-wise apply; such rows simply get no ID.
    id_column = id_column_by_type.get(cols['TP_AUTOR'])
    if id_column is None:
        return None
    return cols[id_column]

# Per-type ID columns that are collapsed into the single 'ID' column
ids = [
  'ID_PESSOA_DISCENTE',
  'ID_PESSOA_DOCENTE',
  'ID_PESSOA_PART_EXTERNO',
  'ID_PESSOA_POS_DOC',
  'ID_PESSOA_EGRESSO',
]

print('⚙️ Unifying author IDs...')
# Row by row, pick the ID selected by TP_AUTOR, then discard the per-type
# columns that are no longer needed.
df = (
    df
    .assign(ID=df[['TP_AUTOR'] + ids].apply(unify_ids, axis=1))
    .drop(columns=ids)
)

In [None]:
print('⚙️ Normalizing df names...')
# Replace every author name with the output of utils.normalize_name
df = df.assign(NM_AUTOR=df['NM_AUTOR'].apply(utils.normalize_name))

In [None]:
print('⚙️ Creating helper columns...')
# Both helper columns are derived from NM_AUTOR: FIRST_LAST_NAME through
# utils.firstAndLastName, FULL_NAME as a plain copy.
df = df.assign(
    FIRST_LAST_NAME=df['NM_AUTOR'].apply(utils.firstAndLastName),
    FULL_NAME=df['NM_AUTOR'],
)

In [None]:
# Columns whose values are aggregated with utils.count_to_dict
counted_columns = [
  'NM_AUTOR',
  'NM_ABNT_AUTOR',
  'TP_AUTOR',
  'NM_TP_CATEGORIA_DOCENTE',
  'NM_NIVEL_DISCENTE',
  'CD_PROGRAMA_IES',
  'NM_PROGRAMA_IES',
  'NM_AREA_CONHECIMENTO',
  'SG_ENTIDADE_ENSINO',
]

# Per-column aggregation used whenever several rows collapse into one author
merge_schema = {
  'ID': 'min',
  'FULL_NAME': 'first',
  'FIRST_LAST_NAME': 'first',
  **{column: utils.count_to_dict for column in counted_columns},
  'ID_ADD_PRODUCAO_INTELECTUAL': list,
}

In [None]:
# Collapse all rows that share an ID into a single record
print('⚙️ Merging authors by ID...')
merged_authors = (
    df
    .groupby(['ID'], sort=False, as_index=False)
    .agg(merge_schema)
)
print("   {} authors with ID after merge".format(len(merged_authors)))

In [None]:
# Rows for which no ID could be determined
authors_without_id = df.loc[df['ID'].isna()]
print("   {} authors without IDs".format(len(authors_without_id)))

In [None]:
# Collapse the ID-less rows that share the same full name
print('⚙️ Merging authors without ID by name...')
authors_without_id = (
    authors_without_id
    .groupby(['FULL_NAME'], sort=False, as_index=False)
    .agg(merge_schema)
)
print("   {} authors without IDs after merge".format(len(authors_without_id)))

In [None]:
# Stack the ID-merged authors on top of the name-merged, ID-less authors
all_authors = pd.concat(objs=[merged_authors, authors_without_id], axis=0)

In [None]:
# Merge all authors by name
print('⚙️ Merging all {} authors by name...'.format(len(all_authors)))

# The production IDs are already lists at this point, so merging means
# concatenating them. merge_schema itself keeps the builtin `sum` because the
# orphan-merging cell hands merge_schema to utils.merge_authors.
merge_schema['ID_ADD_PRODUCAO_INTELECTUAL'] = sum

# For the groupby, request pandas' own 'sum' by name. Passing the builtin only
# worked because pandas silently substituted its groupby sum for it; newer
# pandas releases deprecate that substitution, and the real builtin would fail
# on lists (0 + list raises TypeError).
name_merge_schema = {**merge_schema, 'ID_ADD_PRODUCAO_INTELECTUAL': 'sum'}

merged_authors = all_authors.groupby(['FULL_NAME'], sort=False, as_index=False).agg(name_merge_schema)
print("   {} authors after merge".format(len(merged_authors)))

In [None]:
## Export preliminary authors dataset
# Contains the authors merged by ID and by full name only; it can be
# generated much faster than the complete dataset.
preliminary_csv = 'processed_authors_preliminary.csv'
utils.export_authors_dataframe(merged_authors, preliminary_csv)

In [None]:
# Renumber the rows 0..n-1 and keep the result. reset_index returns a new
# frame, so the bare call had no effect other than displaying the frame.
# drop=True avoids adding the old labels as an extra 'index' column.
merged_authors = merged_authors.reset_index(drop=True)

In [None]:
# Get all the authors without an ID.
# .copy() makes this an independent frame, so the column assignment below
# does not write into a slice of merged_authors (SettingWithCopyWarning).
authors_without_id = merged_authors[merged_authors['ID'].isnull()].copy()

# Generate IDs for authors without ID from their row label.
# NOTE(review): these generated values may collide with real IDs already
# present in merged_authors — confirm the real ID range or add an offset.
authors_without_id['ID'] = authors_without_id.index + 1

In [None]:
# Authors that already carry an ID
authors_with_id = merged_authors.loc[merged_authors['ID'].notna()]

In [None]:
# Merging orphan authors
merge_count = 0
append_count = 0

# Work on an independent copy so the in-place row updates below do not write
# into a slice of merged_authors.
final_authors = authors_with_id.copy()

# The workers are threads sharing these globals (require='sharedmem'). Each
# "search final_authors, then update it" step must be atomic: otherwise
# concurrent appends are lost, and because every append renumbers the rows
# (ignore_index=True) a stale match could be written to the wrong row.
state_lock = threading.Lock()

def process_null_author(idx_na):
  """Merge one ID-less author into final_authors, or append it as a new row.

  Candidates are the rows of final_authors whose FULL_NAME contains the
  orphan's last name. The first candidate for which utils.compare_authors
  returns at least 5 is replaced by the result of utils.merge_authors; if no
  candidate qualifies, the orphan is appended.

  Args:
    idx_na: Positional index of the orphan in authors_without_id.

  Side effects:
    Updates the globals final_authors, merge_count and append_count. Any
    exception is reported on stdout and the author is skipped.
  """
  global final_authors, merge_count, append_count

  # Looked up outside the try block so the error message below can always
  # reference it.
  orphan = authors_without_id.iloc[idx_na]

  try:
    last_name = orphan['FULL_NAME'].split(' ')[-1]

    with state_lock:
      # regex=False: the last name is literal text, not a pattern, so names
      # containing regex metacharacters neither fail nor over-match.
      potential_authors = final_authors[final_authors['FULL_NAME'].str.contains(last_name, na=False, regex=False)]

      for idx_pot in range(len(potential_authors)):
        author = potential_authors.iloc[idx_pot]

        if utils.compare_authors(author, orphan) >= 5:
          print("   🔄 Merging authors ({})'{}' to ({})'{}'".format(idx_na, orphan['FULL_NAME'], idx_pot, author['FULL_NAME']))
          merged = utils.merge_authors(author, orphan, merge_schema)
          final_authors.loc[[author.name]] = pd.DataFrame(merged)
          merge_count = merge_count + 1
          return

      print("   ➕ Appending author ({})'{}'".format(idx_na, orphan['NM_AUTOR']))
      # Wrap the Series in a one-row DataFrame: concatenating a bare Series
      # with a DataFrame treats it as a column and would add one row per
      # field instead of a single author row.
      final_authors = pd.concat([final_authors, pd.DataFrame([orphan])], ignore_index=True)
      append_count = append_count + 1
  except Exception as e:
    # Best effort: report the failing author and continue with the others.
    print("   Error processing author ({})'{}', '{}': {}  -- skipping".format(idx_na, orphan['NM_AUTOR'], orphan['NM_ABNT_AUTOR'], e))

num_cores = multiprocessing.cpu_count()

print('⚙️ Merging {} authors without IDs (using {} cores):'.format(len(authors_without_id), num_cores))

# Parallel processing (thread-based because of require='sharedmem'; updates
# to the shared state are serialized by state_lock)
Parallel(n_jobs=num_cores, require='sharedmem')(delayed(process_null_author)(i) for i in range(len(authors_without_id)))

print("   {} authors were merged and {} were appended to the dataset.".format(merge_count, append_count))

In [None]:
## Export complete authors dataset
# Writes final_authors, i.e. the result of the orphan-merging step
complete_csv = 'processed_authors_complete.csv'
utils.export_authors_dataframe(final_authors, complete_csv)