In [None]:
import re, os, glob, unicodedata, multiprocessing
import threading
from collections import Counter

import pandas as pd
from joblib import Parallel, delayed


# Remove the column limit so displayed DataFrames show every column
pd.set_option('display.max_columns', None)

In [None]:

print('⚙️ Importing authors...')

# Sorted so the row order does not depend on the order glob happens to return
path = os.path.join('datasets/autores/', "autores-*.csv")
data_files = sorted(glob.glob(path))
print(*data_files, sep = "\n")

# Fail with an explicit message instead of pd.concat's generic error
if not data_files:
    raise FileNotFoundError("No author files match '{}'".format(path))

# ignore_index=True gives the combined frame one unique index; without it every
# file contributes its own 0..n labels and label-based lookups become ambiguous
df = pd.concat(
    (pd.read_csv(f, encoding='iso8859_1', delimiter=";") for f in data_files),
    ignore_index=True,
)

# Uncomment to work on a 5% sample of the dataset
# df = df.sample(frac=0.05, random_state=1)

print("   {} authors in the dataset".format(len(df)))

In [None]:
# Data Type Optimization
# https://medium.com/bigdatarepublic/advanced-pandas-optimize-speed-and-memory-a654b53be6c2

def optimize_floats(df: pd.DataFrame) -> pd.DataFrame:
    """Shrink the ``float64`` columns of ``df`` in place and return ``df``.

    A column that holds only finite whole numbers is converted to ``int64``.
    Any other column (fractional values, NaN, infinities) is passed to
    ``pd.to_numeric(..., downcast='float')`` instead of being force-cast to
    ``int64``, which would raise on NaN and silently truncate fractions.

    Args:
        df: frame whose ``float64`` columns are converted; it is modified.

    Returns:
        The same ``df`` object.
    """
    int64_limit = float(2 ** 63)
    for name in df.select_dtypes(include=['float64']).columns.tolist():
        column = df[name]
        # Only a lossless conversion to int64 is allowed
        whole_numbers = (
            column.notna().all()
            and (column % 1 == 0).all()
            and (column.empty or column.abs().max() < int64_limit)
        )
        if whole_numbers:
            df[name] = column.astype('int64')
        else:
            df[name] = pd.to_numeric(column, downcast='float')
    return df

def optimize_ints(df: pd.DataFrame) -> pd.DataFrame:
    """Downcast every ``int64`` column of ``df`` with ``pd.to_numeric``.

    The converted columns are assigned back onto ``df``, which is also the
    return value.
    """
    int_columns = df.select_dtypes(include=['int64']).columns.tolist()
    for name in int_columns:
        df[name] = pd.to_numeric(df[name], downcast='integer')
    return df

def optimize_objects(df: pd.DataFrame) -> pd.DataFrame:
    """Convert low-cardinality ``object`` columns of ``df`` to ``category``.

    A column is converted when fewer than half of its rows hold distinct
    values. Columns whose first element is a list are left alone. The frame is
    modified in place and returned; an empty frame is returned unchanged.

    Args:
        df: frame whose ``object`` columns are inspected; it is modified.

    Returns:
        The same ``df`` object.
    """
    num_total_values = len(df)
    if num_total_values == 0:
        # Nothing to inspect, and the ratio below would divide by zero
        return df

    for col in df.select_dtypes(include=['object']):
        # Positional access: the index may not contain the label 0
        if isinstance(df[col].iloc[0], list):
            continue
        num_unique_values = df[col].nunique(dropna=False)
        if num_unique_values / num_total_values < 0.5:
            df[col] = df[col].astype('category')
    return df

def optimize(df: pd.DataFrame):
    """Run the dtype optimizers on ``df``: objects, then ints, then floats.

    Each step receives the result of the previous one; the result of the last
    step is returned.
    """
    for step in (optimize_objects, optimize_ints, optimize_floats):
        df = step(df)
    return df

print('⚙️ Optimizing columns data types...')

# The return value is not used here: the optimizers assign the converted
# columns back onto the frame they receive, so df itself is updated
optimize(df)

# Print the resulting column dtypes and the frame summary
df.info()

In [None]:
# Keep only the columns the rest of the notebook works with
df = df.filter(items=[
    # year and author names
    'AN_BASE',
    'NM_AUTOR',
    'NM_ABNT_AUTOR',
    # author classification
    'TP_AUTOR',
    'NM_TP_CATEGORIA_DOCENTE',
    'NM_NIVEL_DISCENTE',
    # program / area / institution
    'CD_PROGRAMA_IES',
    'NM_PROGRAMA_IES',
    'NM_AREA_CONHECIMENTO',
    'SG_ENTIDADE_ENSINO',
    # one person-ID column per author type
    'ID_PESSOA_DISCENTE',
    'ID_PESSOA_DOCENTE',
    'ID_PESSOA_PART_EXTERNO',
    'ID_PESSOA_POS_DOC',
    'ID_PESSOA_EGRESSO',
    # production identifier
    'ID_ADD_PRODUCAO_INTELECTUAL',
])

# Unify IDs
def unify_ids(cols):
    """Return the person ID that belongs to the row's author type.

    Args:
        cols: mapping-like row (a DataFrame row or a dict) indexed by column
            name. It must provide ``TP_AUTOR`` and the ID column associated
            with that author type.

    Returns:
        The value of the matching ``ID_PESSOA_*`` column, or ``None`` when the
        author type is ``'-'``, missing, or not one of the known types.
    """
    id_column_by_type = {
      'DOCENTE': 'ID_PESSOA_DOCENTE',
      'EGRESSO': 'ID_PESSOA_EGRESSO',
      'PÓS-DOC': 'ID_PESSOA_POS_DOC',
      'DISCENTE': 'ID_PESSOA_DISCENTE',
      'PARTICIPANTE EXTERNO': 'ID_PESSOA_PART_EXTERNO',
    }
    # .get() turns '-' and any unexpected type into "no ID" instead of a KeyError
    id_column = id_column_by_type.get(cols['TP_AUTOR'])
    if id_column is None:
        return None
    return cols[id_column]

# Person-ID columns, one per author type; they are folded into a single 'ID'
# column below and then dropped
ids = [
    'ID_PESSOA_DISCENTE',
    'ID_PESSOA_DOCENTE',
    'ID_PESSOA_PART_EXTERNO',
    'ID_PESSOA_POS_DOC',
    'ID_PESSOA_EGRESSO',
]

print('⚙️ Unifying author IDs...')

# unify_ids receives one row at a time (author type plus the ID columns)
df['ID'] = df[['TP_AUTOR'] + ids].apply(unify_ids, axis='columns')
df = df.drop(ids, axis='columns')

In [None]:
# https://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-normalize-in-a-python-unicode-string
def strip_accents(text):
  """Return ``text`` with every non-ASCII character removed.

  The text is first decomposed (NFD), so an accented letter becomes its base
  letter followed by combining marks; dropping the non-ASCII characters then
  removes the marks and keeps the base letter.
  """
  decomposed = unicodedata.normalize('NFD', text)
  return ''.join(char for char in decomposed if ord(char) < 128)

def normalize_name(name):
  """Return a cleaned, comparable version of an author name.

  Steps, in order:
    1. "LAST, FIRST" is reordered to "FIRST LAST". The comma is the
       separator whether or not spaces surround it, and empty pieces (for
       example from a trailing comma) are dropped.
    2. Accents are removed with ``strip_accents``.
    3. ``_`` and ``-`` become spaces; digits and ``? & # ; ( )`` are removed.
    4. Runs of whitespace are collapsed to one space and the ends are trimmed,
       so characters removed in step 3 do not leave double spaces behind.

  Args:
    name: the raw author name (a string).

  Returns:
    The normalized name.
  """
  # Split on the bare comma so "SILVA,JOAO" is handled like "SILVA, JOAO"
  pieces = [piece.strip() for piece in name.split(',')]
  norm = ' '.join(piece for piece in reversed(pieces) if piece)

  # remove accents
  norm = strip_accents(norm)

  # remove invalid chars
  norm = re.sub('[_-]', ' ', norm)
  norm = re.sub('[0-9?&#;()]', '', norm)

  # collapse inner whitespace and remove leading and trailing spaces
  norm = ' '.join(norm.split())

  return norm

# Replace every raw author name with its normalized form
print('⚙️ Normalizing df names...')
df['NM_AUTOR'] = df['NM_AUTOR'].apply(normalize_name)

In [None]:
def firstAndLastName(name):
  """Return the first and the last space-separated token of ``name``.

  The two tokens are joined by a single space. A name with one token yields
  that token twice.
  """
  tokens = name.split(' ')
  first, last = tokens[0], tokens[-1]
  return first + ' ' + last

print('⚙️ Creating helper columns...')
# First + last token of the normalized name
df['FIRST_LAST_NAME'] = df['NM_AUTOR'].apply(firstAndLastName)
# Copy of the normalized name under a second column name
df['FULL_NAME'] = df['NM_AUTOR']

In [None]:
## Helper Functions
def agg_to_dict(items):
  """Count the values of ``items`` and return ``{value as string: count}``."""
  as_text = items.astype(str)
  tally = as_text.value_counts()
  return tally.to_dict()

def most_frequent(items):
  """Return the most common non-missing value of ``items``.

  Returns ``None`` when no value is left after dropping the missing ones.
  """
  present = items.dropna()
  if not len(present):
    return None
  (winner, _count), = Counter(present).most_common(1)
  return winner

def toArray(item):
  """Return ``item`` unchanged if it is a non-string iterable, else ``[item]``."""
  if isinstance(item, str) or not hasattr(item, '__iter__'):
    return [item]
  return item

def priority(priority_list):
  """Build an aggregator that returns the best-ranked value present.

  The returned function takes ``items`` (an object exposing ``.array``) and
  returns the first entry of ``priority_list`` found in it, or ``None`` when
  none of the entries is present.
  """
  def pick(items):
    for candidate in priority_list:
      if candidate in items.array:
        return candidate
    return None

  return pick

# How each column is combined when several rows collapse into one author:
# the first FULL_NAME is kept, the production IDs are collected in a list and
# every other column becomes a {value: count} dict
merge_schema = {
  'FULL_NAME': 'first',
  **dict.fromkeys(
    [
      'NM_AUTOR',
      'NM_ABNT_AUTOR',
      'FIRST_LAST_NAME',
      'TP_AUTOR',
      'NM_TP_CATEGORIA_DOCENTE',
      'NM_NIVEL_DISCENTE',
      'CD_PROGRAMA_IES',
      'NM_PROGRAMA_IES',
      'NM_AREA_CONHECIMENTO',
      'SG_ENTIDADE_ENSINO',
    ],
    agg_to_dict,
  ),
  'ID_ADD_PRODUCAO_INTELECTUAL': list,
}

# 'NM_AUTOR': most_frequent,
# 'NM_ABNT_AUTOR': most_frequent,
# 'TP_AUTOR': higher_priority(['DOCENTE', 'EGRESSO', 'PÓS-DOC', 'DISCENTE', 'PARTICIPANTE EXTERNO']),
# 'NM_TP_CATEGORIA_DOCENTE': higher_priority(['PERMANENTE', 'COLABORADOR', 'VISITANTE']),
# 'NM_NIVEL_DISCENTE': higher_priority(['DOUTORADO PROFISSIONAL', 'BACHARELADO', 'MESTRADO', 'DOUTORADO', 'MESTRADO PROFISSIONAL', ]),
# 'NM_PROGRAMA_IES': most_frequent,
# 'NM_AREA_CONHECIMENTO': most_frequent,
# 'SG_ENTIDADE_ENSINO': most_frequent,
# 'ID_ADD_PRODUCAO_INTELECTUAL': list,

In [None]:
# Merge df by ID
# Rows that share an ID collapse into one row; every column listed in
# merge_schema is combined with the aggregation declared there.
# sort=False leaves the groups unsorted and as_index=False keeps ID as a
# regular column of the result.
# NOTE(review): groupby drops rows with a missing key by default, so rows
# without an ID should not appear here — confirm for the pandas version in use.
print('⚙️ Merging authors by ID...')
merged_authors = df.groupby(['ID'], sort=False, as_index=False).agg(merge_schema)
print("   {} authors with ID after merge".format(len(merged_authors)))

# Last expression of the cell: displays the merged table
merged_authors

In [None]:
# Orphan authors: the rows of df for which no ID could be determined
has_no_id = df['ID'].isna()
orphan_authors = df.loc[has_no_id]
print("   {} authors without IDs".format(len(orphan_authors)))

In [None]:
# Score how likely it is that `orphan` and `author` are the same person
def compare_authors(author, orphan):
  """Return a similarity score between a merged author and an orphan row.

  Scoring:
    * +5 when the orphan's FULL_NAME is found in the author's NM_AUTOR
    * +2 when the orphan's FIRST_LAST_NAME is found in the author's
      FIRST_LAST_NAME
    * if neither name matched the result is 0 and nothing else is checked
    * +1 for each of NM_ABNT_AUTOR, SG_ENTIDADE_ENSINO, TP_AUTOR and
      CD_PROGRAMA_IES whose orphan value is found in the author's value

  "Found in" is Python's ``in`` applied to the author's field.
  """
  def found(orphan_field, author_field):
    return orphan[orphan_field] in author[author_field]

  score = 0

  # Name evidence: exact name, then first + last name
  if found('FULL_NAME', 'NM_AUTOR'):
    score += 5
  if found('FIRST_LAST_NAME', 'FIRST_LAST_NAME'):
    score += 2

  # Without any name evidence the remaining fields are irrelevant
  if not score:
    return 0

  # Supporting evidence: ABNT name, university, author type, IES program
  for field in ('NM_ABNT_AUTOR', 'SG_ENTIDADE_ENSINO', 'TP_AUTOR', 'CD_PROGRAMA_IES'):
    if found(field, field):
      score += 1

  return score

def update_item_count(item_count, value):
  """Add one occurrence of ``value`` to the ``item_count`` dict, in place."""
  item_count[value] = item_count.get(value, 0) + 1

def merge_authors(author, orphan_author):
  """Fold the values of ``orphan_author`` into ``author`` and return the row.

  For every column of ``merge_schema`` except ``FULL_NAME``:
    * a list value gets the orphan's value appended;
    * a dict value gets the orphan's value counted, keyed by its string
      form so the keys stay consistent with the string keys produced by
      ``agg_to_dict``.

  NOTE: the returned row is a pandas copy of ``author``, and that copy does
  not duplicate the list/dict objects stored in the cells. The containers are
  therefore shared with ``author`` and are updated in place.

  Args:
    author: row (Series) of an already merged author.
    orphan_author: row (Series) of the author being merged in.

  Returns:
    The copied row, holding the updated containers.

  Raises:
    TypeError: if a merged column holds neither a list nor a dict.
  """
  merged = author.copy(deep=True)
  for column in merged.index.to_list():
    if column == 'FULL_NAME' or column not in merge_schema:
      continue
    container = merged[column]
    orphan_value = orphan_author[column]
    if isinstance(container, list):
      container.append(orphan_value)
    elif isinstance(container, dict):
      update_item_count(container, str(orphan_value))
    else:
      raise TypeError(
        "Column '{}' holds {} instead of a list or dict".format(column, type(container).__name__)
      )

  return merged

# Merging orphan authors: counters updated by process_null_author
merge_count = append_count = 0

# p_authors starts out as another name for merged_authors (no copy is made)
p_authors = merged_authors

# Serializes access to the shared state used below. The notebook calls this
# function from joblib with require='sharedmem', so several calls run at once
# against the same globals.
_authors_lock = threading.Lock()

def process_null_author(idx_na):
  """Merge one orphan author into a matching author, or append it as new.

  The orphan at position ``idx_na`` of ``orphan_authors`` is compared with
  every row of ``merged_authors`` whose FULL_NAME contains the orphan's last
  name. The first candidate scoring at least 5 in ``compare_authors`` absorbs
  the orphan; if there is none, the orphan is aggregated with
  ``merge_schema`` and appended to ``p_authors``.

  Updates the globals ``p_authors``, ``merge_count`` and ``append_count``.
  Errors while processing the orphan are reported and the orphan is skipped.

  Args:
    idx_na: position of the orphan inside ``orphan_authors``.
  """
  global merged_authors, p_authors, merge_count, append_count

  # Looked up outside the try block so the error message can always use it
  orphan = orphan_authors.iloc[idx_na]

  try:
    last_name = orphan['FULL_NAME'].split(' ')[-1]
    # regex=False: the surname is plain text, not a pattern
    name_matches = merged_authors['FULL_NAME'].str.contains(last_name, na=False, regex=False)
    potential_authors = merged_authors[name_matches]

    # Comparing, merging, appending and counting all touch shared state
    with _authors_lock:
      for idx_pot in range(len(potential_authors)):
        author = potential_authors.iloc[idx_pot]

        if compare_authors(author, orphan) >= 5:
          print("   🔄 Merging authors ({})'{}' to ({})'{}'".format(idx_na, orphan['FULL_NAME'], idx_pot, author['FULL_NAME']))
          merged = merge_authors(author, orphan)
          # Write the row back cell by cell; assigning a frame built from
          # the row would not line up with the columns of p_authors
          for column in merge_schema:
            p_authors.at[author.name, column] = merged[column]
          merge_count = merge_count + 1
          return

      print("   Appending author ({})'{}'".format(idx_na, orphan['NM_AUTOR']))
      orphan_df = pd.DataFrame(orphan).T.groupby(['NM_AUTOR']).agg(merge_schema)
      p_authors = pd.concat([p_authors, orphan_df], ignore_index=True)
      append_count = append_count + 1
  except Exception as error:
    # Best effort: report what went wrong and continue with the next orphan
    print("   Error processing author ({})'{}', '{}'  -- skipping ({!r})".format(idx_na, orphan['NM_AUTOR'], orphan['NM_ABNT_AUTOR'], error))

num_cores = multiprocessing.cpu_count()

print('⚙️ Merging authors without IDs (using {} cores):'.format(num_cores))

# Single thread processing (keep commented)
# for i in range(len(orphan_authors)): process_null_author(i)

# Parallel processing: one delayed call per orphan position.
# require='sharedmem' asks joblib for workers that share this process' memory,
# which process_null_author needs because it updates module-level globals.
orphan_jobs = (delayed(process_null_author)(position) for position in range(len(orphan_authors)))
worker_pool = Parallel(n_jobs=num_cores, require='sharedmem')
worker_pool(orphan_jobs)

print("   {} authors were merged and {} were appended to the dataset.".format(merge_count, append_count))

In [None]:
print('⚙️ Exporting authors to processed_authors.csv...')

# Writes the final author table to processed_authors.csv (relative path, so it
# lands in the current working directory)
p_authors.to_csv('processed_authors.csv')

In [None]:
# idx_a = 21335 # 79797, 21335, 171
# idx_na = 2 # 0, 2, 13
# merged_authors.iloc[[idx_a]] = merge_authors(merged_authors.iloc[[idx_a]], orphan_authors.iloc[[idx_na]])

In [None]:
# a = merged_authors.loc[[58693]]
# b = orphan_authors.loc[[1051112]]

# display(a)
# display(b)

# m = merge_authors(a, b)

# display(m)

In [None]:
# merged_authors = pd.concat([merged_authors, orphan_authors], ignore_index=True)
# merged_authors

In [None]:
# merged_authors.to_excel('processed_authors.xlsx')

In [None]:
# def merge_authors(author, orphan_author):
#   merged = author.copy(deep=True)
#   for column in author.index.to_list():
#     print(column)
#     if column not in merge_schema: continue
#     agg_func = merge_schema[column]
#     itemA = toArray(author.get(column).iloc[0])
#     itemB = toArray(orphan_author.get(column).iloc[0])
#     value = agg_func(pd.Series([*itemA, *itemB]))
#     merged[column] = [value]
#   return merged

In [None]:
# After the merge NM_AUTOR holds a dict of name counts (see merge_schema), so a
# text search on it never matches; FULL_NAME still stores the plain name.
merged_authors[merged_authors['FULL_NAME'].str.contains("RECAMONDE", na=False)]