In [None]:
import pandas as pd
import math
import ast

# Render every column when a DataFrame is displayed in the notebook.
pd.set_option('display.max_columns', None)

# Read the processed authors table and drop the IDX, FULL_NAME and
# FIRST_LAST_NAME columns in a single chained expression.
authors = (
  pd.read_csv("./output/processed_authors.csv", delimiter=",")
  .drop(columns=['IDX', 'FULL_NAME', 'FIRST_LAST_NAME'])
)
authors


In [None]:
# Columns that are reduced to a single value with most_freq further down in
# this cell (each cell is parsed with ast.literal_eval and the key with the
# largest value is kept).
dict_fields = [
  'NM_AUTOR',
  'NM_ABNT_AUTOR',
  'CD_PROGRAMA_IES',
  'NM_PROGRAMA_IES',
  'NM_AREA_CONHECIMENTO',
  'SG_ENTIDADE_ENSINO',
]

def parse_array(s):
  """Parse the text form of a list literal into a Python list.

  Args:
    s: Text such as "[1, 2, 3]". Any other object (for example a float NaN)
      is accepted and treated as unparseable.

  Returns:
    The parsed values as a list; a tuple literal is converted to a list so
    that results can always be concatenated with each other. An empty list is
    returned when `s` cannot be parsed or does not evaluate to a list or
    tuple.
  """
  try:
    parsed = ast.literal_eval(s)
  except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
    # Only the errors literal_eval raises for bad input are swallowed, so
    # KeyboardInterrupt / SystemExit still propagate.
    return []

  # Scalars, dicts and sets are not arrays; returning them would break
  # callers that take len() of the result or concatenate results.
  if not isinstance(parsed, (list, tuple)):
    return []

  return list(parsed)

def most_freq(d):
  """Return the key with the largest value in a stringified dict.

  Args:
    d: Text form of a dict literal mapping keys to comparable counts,
      e.g. "{'a': 3, 'b': 1}".

  Returns:
    The key whose value is largest, ignoring the key 'nan'. On ties the key
    that appears first wins. The placeholder '-' is returned when `d` cannot
    be parsed or when no key qualifies (empty dict, only a 'nan' key, or no
    value greater than -1).
  """
  try:
    counts = ast.literal_eval(d)
    best_key = None
    best_count = -1
    for key in counts:
      # Strict '>' keeps the first key seen among equal counts.
      if counts[key] > best_count and key != 'nan':
        best_count = counts[key]
        best_key = key
  except Exception:
    # Best effort: any parse or comparison failure yields the placeholder,
    # while KeyboardInterrupt / SystemExit are no longer swallowed.
    return '-'

  # No qualifying key was found: use the same placeholder as the failure
  # path instead of leaking None.
  return best_key if best_key is not None else '-'

def select_by_priority(priority_list, d):
  """Return the first entry of `priority_list` that is a key of `d`.

  Args:
    priority_list: Candidate keys, highest priority first.
    d: Text form of a dict literal, parsed with ast.literal_eval.

  Returns:
    The first candidate found among the parsed dict's keys, or None when no
    candidate is present or `d` cannot be parsed into an object with keys().
  """
  try:
    present_keys = ast.literal_eval(d).keys()
    # `candidate` avoids shadowing the builtin `type`.
    return next(
      (candidate for candidate in priority_list if candidate in present_keys),
      None,
    )
  except Exception:
    # Best effort: unparseable or non-dict input means "no selection".
    # KeyboardInterrupt / SystemExit are not caught.
    return None
  
def priority(priority_list):
  """Build a one-argument selector bound to `priority_list`.

  Args:
    priority_list: Candidate keys, highest priority first.

  Returns:
    A callable that takes a single value `d` and returns
    select_by_priority(priority_list, d).
  """
  def pick(d):
    return select_by_priority(priority_list, d)

  return pick
  
# Element-wise normalization of the stringified columns. Series.map is used
# instead of DataFrame.applymap, which is deprecated since pandas 2.1 and
# emits a FutureWarning; the per-element result is the same.

# Production ids: parse the list literal.
authors['ID_ADD_PRODUCAO_INTELECTUAL'] = authors['ID_ADD_PRODUCAO_INTELECTUAL'].map(parse_array)

# Categorical columns: keep the first label of the priority list that is
# present among the cell's keys.
authors['TP_AUTOR'] = authors['TP_AUTOR'].map(priority(['DOCENTE', 'EGRESSO', 'PÓS-DOC', 'DISCENTE', 'PARTICIPANTE EXTERNO']))
authors['NM_TP_CATEGORIA_DOCENTE'] = authors['NM_TP_CATEGORIA_DOCENTE'].map(priority(['PERMANENTE', 'COLABORADOR', 'VISITANTE']))
authors['NM_NIVEL_DISCENTE'] = authors['NM_NIVEL_DISCENTE'].map(priority(['DOUTORADO PROFISSIONAL', 'BACHARELADO', 'MESTRADO', 'DOUTORADO', 'MESTRADO PROFISSIONAL']))

# Remaining dict-valued columns: keep the key with the largest value.
authors[dict_fields] = authors[dict_fields].apply(lambda column: column.map(most_freq))

authors

In [None]:
# Collapse rows that share the same NM_AUTOR. Every column listed in the
# fromkeys() tuple keeps the value from the group's first row, and
# ID_ADD_PRODUCAO_INTELECTUAL is combined with 'sum' (presumably concatenating
# the per-row id lists — confirm against the column contents).
authors = authors.groupby(['NM_AUTOR'], sort=False, as_index=False).agg({
  **dict.fromkeys(
    (
      'ID',
      'NM_AUTOR',
      'NM_ABNT_AUTOR',
      'TP_AUTOR',
      'NM_TP_CATEGORIA_DOCENTE',
      'NM_NIVEL_DISCENTE',
      'CD_PROGRAMA_IES',
      'NM_PROGRAMA_IES',
      'NM_AREA_CONHECIMENTO',
      'SG_ENTIDADE_ENSINO',
    ),
    'first',
  ),
  'ID_ADD_PRODUCAO_INTELECTUAL': 'sum',
})

In [None]:
# Add productions count to each author.
# Series.map(len) replaces DataFrame.applymap(len): applymap is deprecated
# since pandas 2.1, and the per-row length is the same.
authors['PROD_COUNT'] = authors['ID_ADD_PRODUCAO_INTELECTUAL'].map(len)

In [None]:
# Name the index 'IDX' so that to_csv writes it as a labelled leading column.
authors.index.name = 'IDX'
# Export the result; note the output uses ';' as the field separator.
authors.to_csv('output/normalized_authors.csv', sep=';')