In [53]:
import pandas as pd
from unidecode import unidecode

In [54]:
# Build the full crop table: for every crop workbook, rename the source headers to
# `<measure>_<crop>` and outer-join the result onto `data_pam` by location key.
# NOTE(review): a later cell in this notebook reassigns `data_pam` (production-only
# version keyed on `location_type` instead of `region_type`), so this result is only
# available until that cell runs.
crops = ['cocoa', 'coffee', 'corn', 'rice', 'soy', 'sugarcane']

# Columns that identify a location; they are the join key for every crop.
start_cols = ['name', 'state', 'region_type']

# Seeding the merge with an empty frame keeps the key columns first in the result,
# because a merge lists the left frame's columns before the right frame's.
data_pam = pd.DataFrame(columns=start_cols)

for crop in crops:

    # Source header -> output column name for this crop.
    # Two different source headers map to `area_planted_<crop>`; presumably each
    # workbook carries only one of them - TODO confirm, otherwise the rename
    # produces two columns with the same name.
    rename_cols = {
        'Local': 'name',
        'Área destinada à colheita (Hectares)': 'area_planted_' + crop,
        'Área plantada (Hectares)': 'area_planted_' + crop,
        'Área colhida (Hectares)': 'area_harvested_' + crop,
        'Quantidade produzida (Toneladas)': 'production_' + crop,
        'Rendimento médio da produção (Quilogramas por Hectare)': 'yield_' + crop,
        'UF': 'state',
        'Tipo região': 'region_type',
        'Valor da produção (Mil Reais)': 'value_' + crop,
    }

    df_temp = (
        pd.read_excel('data/processed/' + crop + '.xlsx')
        .rename(columns=rename_cols)
        # Fully identical rows would multiply on every subsequent merge; dropping
        # them mirrors what the production-only cell does before merging.
        .drop_duplicates()
    )
    data_pam = data_pam.merge(df_temp, on=start_cols, how='outer')



In [55]:
# Build the production-only table (the one used by the rest of this notebook):
# one row per location, one column per crop holding the produced quantity.
crops = ['cocoa', 'coffee', 'corn', 'rice', 'soy', 'sugarcane']
start_cols = ['name', 'state', 'location_type']

# Seeding the merge with an empty frame keeps the key columns first in the result.
data_pam = pd.DataFrame(columns=start_cols)

# Putting all productions into the same df.
for crop in crops:

    # Source header -> output column name; the production column is named after the crop.
    rename_cols = {
        'Local': 'name',
        'Quantidade produzida (Toneladas)': crop,
        'UF': 'state',
        'Tipo região': 'location_type',
    }

    df_temp = pd.read_excel('data/processed/' + crop + '.xlsx').rename(columns=rename_cols)
    # Select with a real list: indexing a DataFrame with a dict view relies on
    # pandas treating an arbitrary iterable as a column list.
    df_temp = df_temp[list(rename_cols.values())].drop_duplicates()
    data_pam = data_pam.merge(df_temp, on=start_cols, how='outer')

# Cleaning strings up: transliterate accented characters to plain ASCII.
data_pam['name'] = data_pam['name'].apply(unidecode)
data_pam['state'] = data_pam['state'].apply(unidecode)

# Translate the location-type labels to English.
# The first step is a substring replacement (`.str.replace`); the remaining labels
# are replaced only when they are the entire cell value (`Series.replace`).
# The source label 'Mesorregiao' is deliberately written out as 'macroregion'.
whole_value_labels = {
    'Microrregiao': 'microregion',
    'Mesorregiao': 'macroregion',
    'UF': 'state',
}
data_pam['location_type'] = (
    data_pam['location_type']
    .apply(unidecode)
    .str.replace('Municipio', 'city', regex=False)
    .replace(whole_value_labels)
)

# The frame's index is written too (pandas default), as an unnamed first column.
data_pam.to_csv('data/processed/data_pam_2022.csv')

In [56]:

# Build `data_ibge`: the IBGE code tables for cities, microregions, mesoregions
# (called "macroregion" here) and states, stacked into one frame with a common
# `name` / `location_type` / `id` triple.

# Source header -> output column name, shared by all four workbooks.
rename_cols = {
    'Nome município': 'name',
    'Código IBGE município': 'id_city',
    'Nome microrregião': 'microregion',
    'Código IBGE microrregião': 'id_microregion',
    'Nome mesorregião': 'macroregion',
    'Código IBGE mesorregião': 'id_macroregion',
    'Nome UF': 'state',
    'Código IBGE UF': 'id_state',
    'Sigla UF': 'UF',
    'Nome região': 'region'
}


def _load_ibge_level(filename, location_type, id_column, text_columns, name_source=None):
    """Load one IBGE code workbook and tag it with its administrative level.

    Parameters
    ----------
    filename : str
        Workbook file name inside ``data/raw/ibge_codes/``.
    location_type : str
        Constant written to the ``location_type`` column of every row.
    id_column : str
        Renamed column whose values become the ``id`` column (cast to int64).
    text_columns : list of str
        Columns to transliterate to ASCII with ``unidecode``.
    name_source : str, optional
        Column copied into ``name``. Leave as None when the rename already
        produces a ``name`` column (the city workbook).

    Returns
    -------
    pandas.DataFrame
        The renamed workbook with ``name`` (if copied), ``location_type`` and
        ``id`` appended, in that order.

    Raises
    ------
    KeyError
        If ``id_column``, ``name_source`` or one of ``text_columns`` is missing
        after the rename.
    """
    frame = pd.read_excel('data/raw/ibge_codes/' + filename).rename(columns=rename_cols)
    if name_source is not None:
        frame['name'] = frame[name_source]
    frame['location_type'] = location_type
    frame['id'] = frame[id_column].astype('int64')
    for text_column in text_columns:
        frame[text_column] = frame[text_column].apply(unidecode)
    return frame


data_ibge_city = _load_ibge_level(
    'municipios.xlsx', 'city', 'id_city',
    ['name', 'microregion', 'macroregion', 'state', 'UF', 'region'],
)

data_ibge_microregion = _load_ibge_level(
    'microrregiao.xlsx', 'microregion', 'id_microregion',
    ['name', 'microregion', 'macroregion', 'state', 'UF', 'region'],
    name_source='microregion',
)

data_ibge_macroregion = _load_ibge_level(
    'mesorregiao.xlsx', 'macroregion', 'id_macroregion',
    ['name', 'state', 'macroregion', 'UF', 'region'],
    name_source='macroregion',
)

data_ibge_state = _load_ibge_level(
    'uf.xlsx', 'state', 'id_state',
    ['name', 'state', 'UF', 'region'],
    name_source='state',
)

# Each frame keeps its own row index, so index labels repeat in the stacked frame
# and in the CSV's unnamed first column.
data_ibge = pd.concat([data_ibge_city, data_ibge_microregion, data_ibge_macroregion, data_ibge_state])

data_ibge.to_csv('data/processed/data_ibge.csv')



In [60]:
# Attach the IBGE codes to the production table, add a per-row production total
# over all crops, and index the result by the IBGE `id`.
# The inner join keeps only locations found in both frames.
crops = ['cocoa', 'coffee', 'corn', 'rice', 'soy', 'sugarcane']
merge_keys = ['name', 'location_type', 'state']

df = (
    data_pam
    .merge(data_ibge, how='inner', on=merge_keys)
    .assign(total=lambda merged: merged[crops].sum(axis=1))
    .set_index('id')
)

# NOTE(review): an earlier draft cast the id columns ('id', 'id_microregion',
# 'id_macroregion', 'id_state') to int64 here; that cast is currently not applied.



In [65]:
# For each microregion, find the city with the maximum total production; that city
# is the reference location for distance purposes.

# Zero the total on the microregion rows so a microregion's own row does not
# outrank its cities when looking for the maximum.
# NOTE(review): this overwrites `total` on those rows in `df` itself.
microregion_rows = df['location_type'] == 'microregion'
df.loc[microregion_rows, 'total'] = 0

# NOTE(review): rows are grouped by microregion *name*; confirm names are unique
# across states, otherwise same-named microregions are pooled together.
grouped = df.groupby('microregion')

# `idxmax` yields the index label of the largest total in each group; `df` is
# indexed by `id`, so the stored value is the id of that row.
df['reference_id'] = grouped['total'].transform('idxmax')