In [None]:
import os
import glob
import json
import utils
import pandas as pd

# Show every column when a DataFrame is displayed (None removes the column limit)
pd.options.display.max_columns = None

In [None]:
print('⚙️ Importing productions...')

# Collect every productions export. glob returns matches in arbitrary,
# filesystem-dependent order, so sort them: the resulting row order decides
# which record counts as the "first" one when rows are grouped by title later.
path = os.path.join('datasets/producao_intelectual/', "producoes-*.csv")
data_files = sorted(glob.glob(path))
if not data_files:
  # Fail with an explicit message instead of pandas' "No objects to concatenate"
  raise FileNotFoundError("No production files found matching {}".format(path))
print(*data_files, sep = "\n")

# Files are read as ';'-separated, ISO-8859-1 encoded CSVs. ignore_index gives
# the combined frame one continuous index instead of repeating each file's own.
df = pd.concat(
  (pd.read_csv(f, encoding='iso8859_1', delimiter=";") for f in data_files),
  ignore_index=True,
)

# Uncomment to work on a reproducible 5% sample of the dataset
# df = df.sample(frac=0.05, random_state=1)

print("   {} productions in the dataset".format(len(df)))
df

In [None]:
# Keep only the columns used by the rest of the notebook.
# DataFrame.filter(items=...) keeps the listed labels that exist in the frame.
fields_of_interest = [
  'NM_PRODUCAO',
  'NM_TIPO_PRODUCAO',
  'NM_SUBTIPO_PRODUCAO',
  'AN_BASE',
  'SG_ENTIDADE_ENSINO',
  'NM_PROGRAMA_IES',
  'NM_AREA_CONCENTRACAO',
  'NM_LINHA_PESQUISA',
  'NM_PROJETO',
  'ID_ADD_PRODUCAO_INTELECTUAL',
]
df = df.filter(items=fields_of_interest)

df

In [None]:
print('⚙️ Normalizing production titles...')
# Run every title through utils.normalize_title and store the result back in
# the same column, which is the grouping key used afterwards.
normalized_titles = df['NM_PRODUCAO'].apply(utils.normalize_title)
df['NM_PRODUCAO'] = normalized_titles

In [None]:
# Collapse rows that share the same (normalized) title into a single row.
# Key order below defines the column order of the result.
first_value_columns = ['NM_TIPO_PRODUCAO', 'NM_SUBTIPO_PRODUCAO', 'AN_BASE']
counted_columns = [
  'SG_ENTIDADE_ENSINO',
  'NM_PROGRAMA_IES',
  'NM_AREA_CONCENTRACAO',
  'NM_LINHA_PESQUISA',
  'NM_PROJETO',
]

aggregations = {column: 'first' for column in first_value_columns}
# These columns are summarized per group by utils.count_to_dict
aggregations.update({column: utils.count_to_dict for column in counted_columns})
# Keep every production id of the group, in row order
aggregations['ID_ADD_PRODUCAO_INTELECTUAL'] = list

titles_groupby = df.groupby(['NM_PRODUCAO'], sort=False, as_index=False)
df_grouped = titles_groupby.agg(aggregations)
df_grouped

In [None]:
print('⚙️ Exporting production id replacements list to output/prod_id_replacements.json...')
os.makedirs('output/', exist_ok=True)

# For each group of productions sharing a title, map every id after the first
# one to the first id of that group. Groups with a single id add no entry.
replacements = {
  duplicate_id: id_list[0]
  for id_list in df_grouped['ID_ADD_PRODUCAO_INTELECTUAL']
  for duplicate_id in id_list[1:]
}

# Use a context manager so the file is flushed and closed deterministically
with open('output/prod_id_replacements.json', 'w') as replacements_file:
  json.dump(replacements, replacements_file)

In [None]:
print('⚙️ Importing productions details...')

# Sort the matches: glob order is arbitrary, and later files override earlier
# ones in `details` when the same id appears in more than one file.
path = os.path.join('datasets/detalhes_producao/', "detalhes-prod-*.csv")
data_files = sorted(glob.glob(path))
print(*data_files, sep = "\n")

# Maps each production id to a dict of its detail columns: {id: {column: value}}
details = {}

for f in data_files:
  file_details = pd.read_csv(f, encoding='iso8859_1', delimiter=";").set_index('ID_ADD_PRODUCAO_INTELECTUAL')
  # 'index' is the supported orient name; the previous 'index_names' only worked
  # through deprecated prefix matching and raises ValueError on pandas >= 2.0.
  details.update(file_details.to_dict(orient='index'))

# Replace each group's id list with its first id. The isinstance guard keeps
# this cell re-runnable after the column has already been collapsed.
df_grouped['ID_ADD_PRODUCAO_INTELECTUAL'] = df_grouped['ID_ADD_PRODUCAO_INTELECTUAL'].map(
  lambda ids: ids[0] if isinstance(ids, list) else ids
)
# Ids with no entry in `details` get NaN in the new column
df_grouped['DETALHES'] = df_grouped['ID_ADD_PRODUCAO_INTELECTUAL'].map(details)


In [None]:
print('⚙️ Exporting productions to output/processed_productions.csv...')
# Name the index so it is written under an 'IDX' header in the ';'-separated CSV
output_csv = 'output/processed_productions.csv'
df_grouped.index = df_grouped.index.rename('IDX')
df_grouped.to_csv(output_csv, sep=';')