In [None]:
import os
import glob
import utils
import pandas as pd

# Show every column when a DataFrame is rendered (None lifts the column limit).
pd.options.display.max_columns = None

In [None]:
print('⚙️ Importing productions...')

path = os.path.join('datasets/producao_intelectual/', "producoes-*.csv")
# glob yields matches in arbitrary order; sort them so the row order of the
# concatenated frame (and any order-dependent aggregation) is reproducible.
data_files = sorted(glob.glob(path))
print(*data_files, sep = "\n")

# Fail early with a clear message instead of pandas' "No objects to concatenate".
if not data_files:
  raise FileNotFoundError("No production files match '{}'".format(path))

# ignore_index=True gives the combined frame one continuous row index instead
# of repeating 0..n-1 for every input file.
df = pd.concat(
  (pd.read_csv(f, encoding='iso8859_1', delimiter=";") for f in data_files),
  ignore_index=True,
)

# Uncomment to work on a reproducible 5% sample of the dataset
# df = df.sample(frac=0.05, random_state=1)

print("   {} productions in the dataset".format(len(df)))
df

In [None]:
print('⚙️ Importing productions details...')

path = os.path.join('datasets/detalhes_producao/', "detalhes-prod-*.csv")
# Sorted so the row order of the combined details is reproducible.
data_files = sorted(glob.glob(path))
print(*data_files, sep = "\n")

print('⚙️ Joining productions with production details...')

if data_files:
  # Stack every details file into a single frame so that all of them
  # contribute rows, then join once.
  details = pd.concat(
    (pd.read_csv(f, encoding='iso8859_1', delimiter=";") for f in data_files),
    ignore_index=True,
  )

  # Match on the key *column* of both frames. DataFrame.join(on=...) would
  # instead compare the productions' key with the details' row index.
  # NOTE(review): assumes the details files contain an
  # 'ID_ADD_PRODUCAO_INTELECTUAL' column with the same dtype as in `df` — confirm.
  # Columns present on both sides keep the productions' values; the details'
  # copies get the '_REMOVE' suffix.
  df_merged = df.merge(
    details,
    on='ID_ADD_PRODUCAO_INTELECTUAL',
    how='outer',
    suffixes=('', '_REMOVE'),
  )
else:
  # No details files found: carry the productions through unchanged.
  df_merged = df.copy()

df_merged

In [None]:
# Collect the columns carrying the '_REMOVE' join suffix and drop them in place.
suffixed_columns = [name for name in df_merged.columns if '_REMOVE' in name]
df_merged.drop(columns=suffixed_columns, inplace=True)
df_merged

In [None]:
print('⚙️ Optimizing columns data types...')
# The return value is discarded, so this presumably converts the columns of
# df_merged in place — NOTE(review): confirm against utils.optimize.
utils.optimize(df_merged)
# Print the per-column dtype / non-null summary of the frame.
df_merged.info()

In [None]:
# Columns for which the first value seen in each group is kept.
first_value_columns = ['NM_TIPO_PRODUCAO', 'NM_SUBTIPO_PRODUCAO', 'AN_BASE']

# Columns collapsed per group by utils.agg_to_dict.
dict_columns = [
  'SG_ENTIDADE_ENSINO',
  'NM_PROGRAMA_IES',
  'NM_AREA_CONCENTRACAO',
  'NM_LINHA_PESQUISA',
  'NM_PROJETO',
]

# Build the aggregation spec in the same column order as the output frame.
aggregations = dict.fromkeys(first_value_columns, 'first')
aggregations.update(dict.fromkeys(dict_columns, utils.agg_to_dict))
# All production ids that share a name are gathered into one list.
aggregations['ID_ADD_PRODUCAO_INTELECTUAL'] = list

# One output row per production name, in order of first appearance (sort=False).
productions_by_name = df_merged.groupby(['NM_PRODUCAO'], sort=False, as_index=False)
df_grouped = productions_by_name.agg(aggregations)
df_grouped

In [None]:
print('⚙️ Exporting productions to output/processed_productions.csv...')

# to_csv does not create missing folders, so make sure the target directory
# exists before writing (no-op when it is already there).
os.makedirs('output', exist_ok=True)
df_grouped.to_csv('output/processed_productions.csv')