In [28]:
import pandas as pd
from unidecode import unidecode

In [29]:
# Build the full PAM database: one row per location, carrying every metric
# found in the crop workbooks (areas, production, yield, value) for each crop.
# NOTE(review): the next cell rebuilds `data_pam` from scratch with different
# key names, so this result is lost unless it is saved or renamed first.
crops = ['cocoa', 'coffee', 'corn', 'rice', 'soy', 'sugarcane']

# Join keys shared by every crop workbook after renaming.
start_cols = ['name', 'state', 'region_type']

# Seed the merge with an empty frame that only holds the join keys.
data_pam = pd.DataFrame(columns=start_cols)

for crop in crops:

    # Source headers -> snake_case names. Metric columns get the crop as a
    # suffix so the metrics of different crops do not collide after merging.
    rename_cols = {
        'Local': 'name',
        # Two different source headers map to the same target name;
        # presumably a workbook only ever contains one of them — TODO confirm.
        'Área destinada à colheita (Hectares)': 'area_planted_' + crop,
        'Área plantada (Hectares)': 'area_planted_' + crop,
        'Área colhida (Hectares)': 'area_harvested_' + crop,
        'Quantidade produzida (Toneladas)': 'production_' + crop,
        'Rendimento médio da produção (Quilogramas por Hectare)': 'yield_' + crop,
        'UF': 'state',
        'Tipo região': 'region_type',
        'Valor da produção (Mil Reais)': 'value_' + crop,
    }

    df_temp = pd.read_excel('data/processed/' + crop + '.xlsx').rename(columns=rename_cols)
    # Outer merge keeps locations that appear for only some of the crops.
    data_pam = data_pam.merge(df_temp, on=start_cols, how='outer')



In [30]:
# Build the production-only database (the one used in the rest of this
# notebook): one row per location, one production column per crop.
crops = ['cocoa', 'coffee', 'corn', 'rice', 'soy', 'sugarcane']
start_cols = ['name', 'state', 'location_type']
data_pam = pd.DataFrame(columns=start_cols)

# Put the production of every crop into the same frame.
for crop in crops:

    # Only the join keys and the produced quantity are kept; the quantity
    # column is named after the crop itself.
    rename_cols = {
        'Local': 'name',
        'Quantidade produzida (Toneladas)': crop,
        'UF': 'state',
        'Tipo região': 'location_type',
    }

    df_temp = pd.read_excel('data/processed/' + crop + '.xlsx').rename(columns=rename_cols)
    # Select with an explicit list rather than a dict view.
    df_temp = df_temp[list(rename_cols.values())].drop_duplicates()
    # Outer merge keeps locations that appear for only some of the crops.
    data_pam = data_pam.merge(df_temp, on=start_cols, how='outer')

# Clean the strings up: transliterate accented characters to plain ASCII.
data_pam['name'] = data_pam['name'].apply(unidecode)
data_pam['state'] = data_pam['state'].apply(unidecode)

# Translate the (already transliterated) region kinds to the English labels
# used in this notebook. A single exact-match mapping replaces the earlier mix
# of substring and whole-value replacements.
location_type_labels = {
    'Municipio': 'city',
    'Microrregiao': 'microregion',
    'Mesorregiao': 'macroregion',
    'UF': 'state',
}
data_pam['location_type'] = (
    data_pam['location_type']
    .apply(unidecode)
    .replace(location_type_labels)
)

data_pam.to_csv('data/processed/data_pam_2022.csv')

In [31]:

# Headers of the IBGE code workbooks -> column names used in this notebook.
rename_cols = {
    'Nome município': 'name',
    'Código IBGE município': 'id_city',
    'Nome microrregião': 'microregion',
    'Código IBGE microrregião': 'id_microregion',
    'Nome mesorregião': 'macroregion',
    'Código IBGE mesorregião': 'id_macroregion',
    'Nome UF': 'state',
    'Código IBGE UF': 'id_state',
    'Sigla UF': 'UF',
    'Nome região': 'region'
}


def _load_ibge_level(path, column_names, location_type, id_column, str_columns, name_column=None):
    """Read one IBGE code workbook and bring it to the shared lookup layout.

    Parameters
    ----------
    path : str
        Location of the Excel workbook to read.
    column_names : dict
        Mapping from the workbook's headers to the notebook's column names.
    location_type : str
        Label written to the ``location_type`` column of every row.
    id_column : str
        Renamed column whose values become the ``id`` column (cast to int64).
    str_columns : list of str
        Text columns to transliterate to plain ASCII with ``unidecode``.
    name_column : str, optional
        Column copied into ``name``. Leave as ``None`` when the workbook
        already provides ``name`` through ``column_names``.

    Returns
    -------
    pandas.DataFrame
        The renamed workbook with ``location_type`` and ``id`` added.
    """
    level = pd.read_excel(path).rename(columns=column_names)
    if name_column is not None:
        level['name'] = level[name_column]
    level['location_type'] = location_type
    level['id'] = level[id_column].astype('int64')
    for col in str_columns:
        level[col] = level[col].apply(unidecode)
    return level


data_ibge_city = _load_ibge_level(
    'data/raw/ibge_codes/municipios.xlsx',
    rename_cols,
    location_type='city',
    id_column='id_city',
    str_columns=['name', 'microregion', 'macroregion', 'state', 'UF', 'region'],
)

data_ibge_microregion = _load_ibge_level(
    'data/raw/ibge_codes/microrregiao.xlsx',
    rename_cols,
    location_type='microregion',
    id_column='id_microregion',
    str_columns=['name', 'microregion', 'macroregion', 'state', 'UF', 'region'],
    name_column='microregion',
)

data_ibge_macroregion = _load_ibge_level(
    'data/raw/ibge_codes/mesorregiao.xlsx',
    rename_cols,
    location_type='macroregion',
    id_column='id_macroregion',
    str_columns=['name', 'state', 'macroregion', 'UF', 'region'],
    name_column='macroregion',
)

data_ibge_state = _load_ibge_level(
    'data/raw/ibge_codes/uf.xlsx',
    rename_cols,
    location_type='state',
    id_column='id_state',
    str_columns=['name', 'state', 'UF', 'region'],
    name_column='state',
)

# Stack the four levels into a single lookup table. Columns that a level does
# not have (e.g. `id_city` for a state) are filled with NaN by the concat.
data_ibge = pd.concat([data_ibge_city, data_ibge_microregion, data_ibge_macroregion, data_ibge_state])

data_ibge.to_csv('data/processed/data_ibge.csv')



In [32]:
# Attach the IBGE codes to the production table, add the total production
# over all crops, and index the result by the IBGE id of each location.
crops = ['cocoa', 'coffee', 'corn', 'rice', 'soy', 'sugarcane']
df = (
    data_pam
    .merge(data_ibge, on=['name', 'location_type', 'state'], how='inner')
    .assign(total=lambda merged: merged[crops].sum(axis=1))
    .set_index('id')
)


In [33]:
# For every aggregated location (microregion, macroregion, state), find the
# city with the largest total production inside it. That city is the reference
# point for distance purposes. Cities are their own reference.
import numpy as np  # NOTE(review): would normally live in the imports cell at the top

is_city = df['location_type'] == 'city'
cities = df.loc[is_city]

# Start from a clean column so re-running the cell never keeps stale values.
df['location_id'] = np.nan
df.loc[is_city, 'location_id'] = cities.index.to_numpy()

for level in ('microregion', 'macroregion', 'state'):
    # Region names are not guaranteed to be unique across states, so the state
    # is always part of the key that links a region to its cities.
    keys = ['state'] if level == 'state' else ['state', level]

    # Index label (IBGE id) of the top-producing city of each region.
    # On ties, idxmax keeps the first occurrence.
    top_city = (
        cities
        .dropna(subset=keys)
        .groupby(keys)['total']
        .idxmax()
        .rename('top_city_id')
    )

    is_level = df['location_type'] == level
    # `join(on=...)` keeps the row order of the left side, so the values can be
    # written back positionally.
    matched = df.loc[is_level, keys].join(top_city, on=keys)['top_city_id']
    df.loc[is_level, 'location_id'] = matched.to_numpy()

# Fail with a readable message instead of an opaque cast error when a location
# has no city to refer to.
missing = df['location_id'].isna()
if missing.any():
    raise ValueError(
        'No reference city found for: '
        + ', '.join(sorted(df.loc[missing, 'name'].astype(str).unique()))
    )

df['location_id'] = df['location_id'].astype(int)



In [34]:
# Persist the location table (indexed by IBGE id) for later use.
location_db_path = 'data/processed/location_db.csv'
df.to_csv(location_db_path)

In [35]:
# Inspect the state-level rows of the location table.
is_state = df['location_type'].eq('state')
df[is_state]

Unnamed: 0_level_0,name,state,location_type,cocoa,coffee,corn,rice,soy,sugarcane,id_city,microregion,id_microregion,macroregion,id_macroregion,id_state,UF,region,total,location_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
12,Acre,Acre,state,0,2570,135276,4605,22667,11896,,,,,,12,AC,Norte,177014,1200385
27,Alagoas,Alagoas,state,0,0,48055,16913,11242,17050772,,,,,,27,AL,Nordeste,17126982,2702306
16,Amapa,Amapa,state,0,0,2350,890,18035,6997,,,,,,16,AP,Norte,28272,1600303
13,Amazonas,Amazonas,state,656,628,5593,2963,13740,37239,,,,,,13,AM,Norte,60819,1303536
29,Bahia,Bahia,state,109748,233325,2461097,669,6074022,4688240,,,,,,29,BA,Nordeste,13567101,2911105
23,Ceara,Ceara,state,11,448,538505,17116,7740,522601,,,,,,23,CE,Nordeste,1086421,2312304
53,Distrito Federal,Distrito Federal,state,0,1205,327000,0,303120,27092,,,,,,53,DF,Centro-Oeste,658417,5300108
32,Espirito Santo,Espirito Santo,state,11703,950823,52537,373,200,3108481,,,,,,32,ES,Sudeste,4124117,3203205
52,Goias,Goias,state,0,16957,10709893,107769,15216144,73525072,,,,,,52,GO,Centro-Oeste,99575835,5218508
21,Maranhao,Maranhao,state,0,0,2278917,181197,3537377,2827703,,,,,,21,MA,Nordeste,8825194,2111607


In [36]:
# Spot-check a single city by its IBGE id (Bonfim, Roraima).
bonfim_id = 1400159
df.loc[bonfim_id]

name                           Bonfim
state                         Roraima
location_type                    city
cocoa                               0
coffee                              0
corn                            47600
rice                            50050
soy                             79137
sugarcane                         875
id_city                     1400159.0
microregion       Nordeste de Roraima
id_microregion                14002.0
macroregion          Norte de Roraima
id_macroregion                 1401.0
id_state                           14
UF                                 RR
region                          Norte
total                          177662
location_id                   1400159
Name: 1400159, dtype: object

In [37]:
# Pivot the long origin/destination table into a matrix of distances:
# one row per origin ('origem'), one column per destination ('destino').
distances_long = pd.read_csv(
    'data/raw/distance_matrix/matriz_distancias.csv',
    index_col=['origem', 'destino'],
)
distance_matrix = (
    distances_long
    .drop(columns='tempo')['distancia']
    .unstack()
    .rename_axis(index='origin')
)
distance_matrix.to_csv('data/processed/distance_matrix.csv')



In [38]:
# Cut the distance matrix down to the cities of Sao Paulo state.
is_sp_city = (df['location_type'] == 'city') & (df['state'] == 'Sao Paulo')
data_sp = df.loc[is_sp_city]

# Drop the last digit of every id before the lookup — presumably the distance
# matrix is keyed by the shorter form of the IBGE city code; TODO confirm.
code_list = (data_sp['location_id'].to_numpy() / 10).astype(int)

# Label-based selection: raises KeyError if a code is absent from the matrix.
new_dm = distance_matrix.loc[code_list, code_list]