In [29]:
import numpy as np
import pandas as pd
from unidecode import unidecode

In [30]:
# Build the PAM production table: one row per (name, state, location_type) holding, for every
# crop, the produced quantity plus the planted and harvested areas. Only the production-related
# columns of each spreadsheet are kept.
crops = ['cocoa', 'coffee', 'corn', 'rice', 'soy', 'sugarcane']
start_cols = ['name', 'state', 'location_type']
data_pam = pd.DataFrame(columns=start_cols)

for crop in crops:

    # Source header -> output column name for this crop.
    # NOTE(review): two different area headers map to the same 'area_planted_<crop>' column;
    # presumably each spreadsheet carries only one of them -- TODO confirm.
    rename_cols = {
        'Local': 'name',
        'Quantidade produzida (Toneladas)': crop,
        'Área destinada à colheita (Hectares)' : 'area_planted_' + crop,
        'Área plantada (Hectares)': 'area_planted_' + crop,
        'Área colhida (Hectares)': 'area_harvested_' + crop,
        'UF': 'state',
        'Tipo região': 'location_type',
    }

    # De-duplicate the target names while keeping their order. A plain set() would make the
    # column order (and therefore the CSV layout) change from one kernel run to the next.
    new_cols = list(dict.fromkeys(rename_cols.values()))

    df_temp = pd.read_excel('data/processed/' + crop + '.xlsx').rename(columns=rename_cols)
    df_temp = df_temp[new_cols].drop_duplicates()
    # Outer merge: a location missing from this crop's file keeps its row, with NaN for this crop.
    data_pam = data_pam.merge(df_temp, on=start_cols, how='outer')

# cleaning strings up: strip accents from the key columns so they can be matched against other tables
for key_col in start_cols:
    data_pam[key_col] = data_pam[key_col].apply(unidecode)

# Translate the (accent-stripped) region-type labels. Whole values are matched, so any label
# that is not listed here is left unchanged.
location_type_labels = {
    'Municipio': 'city',
    'Microrregiao': 'microregion',
    'Mesorregiao': 'macroregion',
    'UF': 'state',
}
data_pam['location_type'] = data_pam['location_type'].replace(location_type_labels)

data_pam.to_csv('data/processed/data_pam_2022.csv')

In [31]:

# Build a single IBGE lookup table covering cities, microregions, macroregions and states.
# Source header -> output column name, shared by all four spreadsheets.
rename_cols = {
    'Nome município': 'name',
    'Código IBGE município': 'id_city',
    'Nome microrregião': 'microregion',
    'Código IBGE microrregião': 'id_microregion',
    'Nome mesorregião': 'macroregion',
    'Código IBGE mesorregião': 'id_macroregion',
    'Nome UF': 'state',
    'Código IBGE UF': 'id_state',
    'Sigla UF': 'UF',
    'Nome região': 'region'
}


def _load_ibge_level(filename, location_type, text_columns, name_from=None):
    """Load one IBGE code spreadsheet and bring it to the shared layout.

    Parameters
    ----------
    filename : str
        File name inside 'data/raw/ibge_codes/'.
    location_type : str
        Value written to the 'location_type' column; the 'id' column is taken from
        the 'id_<location_type>' column, cast to int64.
    text_columns : list of str
        Columns whose text is passed through ``unidecode``.
    name_from : str, optional
        Column copied into 'name'. Leave as None when the renamed sheet already
        provides a 'name' column.

    Returns
    -------
    pandas.DataFrame
        The renamed sheet with 'location_type' and 'id' (and 'name', if requested) added.
    """
    level_df = pd.read_excel('data/raw/ibge_codes/' + filename).rename(columns=rename_cols)
    if name_from is not None:
        level_df['name'] = level_df[name_from]
    level_df['location_type'] = location_type
    level_df['id'] = level_df['id_' + location_type].astype('int64')
    for text_col in text_columns:
        level_df[text_col] = level_df[text_col].apply(unidecode)
    return level_df


data_ibge_city = _load_ibge_level(
    'municipios.xlsx', 'city',
    ['name', 'microregion', 'macroregion', 'state', 'UF', 'region'],
)
data_ibge_microregion = _load_ibge_level(
    'microrregiao.xlsx', 'microregion',
    ['name', 'microregion', 'macroregion', 'state', 'UF', 'region'],
    name_from='microregion',
)
data_ibge_macroregion = _load_ibge_level(
    'mesorregiao.xlsx', 'macroregion',
    ['name', 'state', 'macroregion', 'UF', 'region'],
    name_from='macroregion',
)
data_ibge_state = _load_ibge_level(
    'uf.xlsx', 'state',
    ['name', 'state', 'UF', 'region'],
    name_from='state',
)

# Stack the four levels; columns a level does not have are filled with NaN by concat.
data_ibge = pd.concat([data_ibge_city, data_ibge_microregion, data_ibge_macroregion, data_ibge_state])

data_ibge.to_csv('data/processed/data_ibge.csv')



In [32]:
# Attach the IBGE codes to the production table, add the summed production of the
# selected crops as 'total', and index the result by the IBGE id.
crops = ['cocoa', 'coffee', 'corn', 'rice', 'soy', 'sugarcane']
merge_keys = ['name', 'location_type', 'state']
df = (
    data_pam
    .merge(data_ibge, how='inner', on=merge_keys)
    .assign(total=lambda merged: merged[crops].sum(axis=1))
    .set_index('id')
)


In [33]:
# finding out urea consumption of each region. At first we'll be considering only the selected crops.
# source - IFA report 2018
# Conversion factor between nitrogen and urea mass (used below as N / kgN_kgurea).
kgN_kgurea = 0.46

# Nitrogen application rate per crop, multiplied by the planted area below.
# NOTE(review): presumably kg N per hectare, which would make the result tonnes of urea -- TODO confirm.
N_consumption = {
    'rice': 83,
    'corn': 68,
    'soy': 16,
    'sugarcane': 76,
    'coffee': 161,
    'cocoa' : 49
}

df['urea_consumption'] = 0
for crop in crops:
    # A region missing from a crop's source file has NaN area after the outer merge. Treat it
    # as 0 so one missing crop does not turn the whole row's sum into NaN (the 'total' column
    # already ignores NaN in the same way).
    planted_area = df['area_planted_' + crop].fillna(0)
    df['urea_consumption'] = df['urea_consumption'] + N_consumption[crop] * planted_area / kgN_kgurea / 1000


In [34]:
# For each aggregated region (microregion, macroregion, state), find the city with the maximum
# total production. That city is the region's reference location for distance purposes and its
# id is stored in 'location_id'.
is_city = df['location_type'] == 'city'
cities = df.loc[is_city]

# A city is its own reference location.
df.loc[is_city, 'location_id'] = cities.index

for level in ['microregion', 'macroregion', 'state']:
    is_level = df['location_type'] == level
    if not is_level.any():
        continue

    # One pass over the city rows: region name -> id of its top-producing city.
    # groupby ignores rows whose region name is missing, so no explicit NaN check is needed.
    top_city_by_region = cities.groupby(level)['total'].idxmax()
    reference_ids = df.loc[is_level, level].map(top_city_by_region)

    # Fail loudly when a region row has no matching city, rather than hiding it until the int cast.
    unresolved = reference_ids.isna()
    if unresolved.any():
        missing_regions = list(df.loc[is_level, level][unresolved].unique())
        raise ValueError(f"no city rows found for {level}(s): {missing_regions}")

    # Assign by position (plain array) so the write does not depend on index alignment.
    df.loc[is_level, 'location_id'] = reference_ids.to_numpy()

df['location_id'] = df['location_id'].astype(int)



In [35]:
# Write the location table to disk; the 'id' index is written as the first column.
df.to_csv('data/processed/location_db.csv')

In [36]:
# Show only the state-level rows of the location table.
df.query("location_type == 'state'")

Unnamed: 0_level_0,name,state,location_type,area_planted_cocoa,area_harvested_cocoa,cocoa,area_planted_coffee,area_harvested_coffee,coffee,corn,...,microregion,id_microregion,macroregion,id_macroregion,id_state,UF,region,total,urea_consumption,location_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12,Acre,Acre,state,0,0,0,1062,999,2570,135276,...,,,,,12,AC,Norte,177014,7329.08,1200385
27,Alagoas,Alagoas,state,0,0,0,0,0,0,48055,...,,,,,27,AL,Nordeste,17126982,54943.38,2702306
16,Amapa,Amapa,state,0,0,0,0,0,0,2350,...,,,,,16,AP,Norte,28272,799.0152,1600303
13,Amazonas,Amazonas,state,1214,1203,656,320,313,628,5593,...,,,,,13,AM,Norte,60819,1144.289,1303536
29,Bahia,Bahia,state,410785,410185,109748,121534,121534,233325,2461097,...,,,,,29,BA,Nordeste,13567101,252126.7,2911105
23,Ceara,Ceara,state,7,7,11,1302,1302,448,538505,...,,,,,23,CE,Nordeste,1086421,89233.16,2312304
53,Distrito Federal,Distrito Federal,state,0,0,0,418,418,1205,327000,...,,,,,53,DF,Centro-Oeste,658417,11991.98,5300108
32,Espirito Santo,Espirito Santo,state,17488,17488,11703,408681,408646,950823,52537,...,,,,,32,ES,Sudeste,4124117,155937.8,3203205
52,Goias,Goias,state,0,0,0,6771,6771,16957,10709893,...,,,,,52,GO,Centro-Oeste,99575835,591458.7,5218508
21,Maranhao,Maranhao,state,0,0,0,0,0,0,2278917,...,,,,,21,MA,Nordeste,8825194,140284.5,2111607


In [37]:
# Spot-check a single row by id: 5107925 is the city of Sorriso (Mato Grosso).
sorriso_id = 5107925
df.loc[sorriso_id]

name                                     Sorriso
state                                Mato Grosso
location_type                               city
area_planted_cocoa                             0
area_harvested_cocoa                           0
cocoa                                          0
area_planted_coffee                            0
area_harvested_coffee                          0
coffee                                         0
corn                                     3787800
area_planted_corn                         535000
area_harvested_corn                       535000
area_planted_rice                              0
rice                                           0
area_harvested_rice                            0
area_planted_soy                          598500
soy                                      2118690
area_harvested_soy                        598500
area_harvested_sugarcane                    1800
sugarcane                                 126000
area_planted_sugarca

In [40]:
# Load the long-format distance table (origin, destination, distance) and pivot it into a
# square origin x destination matrix.
distances_long = pd.read_csv(
    'data/raw/distance_matrix/matriz_distancias.csv',
    index_col=['origem', 'destino'],
).drop(columns='tempo')
distance_matrix = distances_long['distancia'].unstack()
distance_matrix.index.name = 'origin'

# Relabel rows and columns: every id of the location table, with its last digit dropped,
# is mapped back to the full id. Labels that match no truncated id are left unchanged.
full_ids = list(df.index)
rename_dict = {full_id // 10: full_id for full_id in full_ids}
distance_matrix = distance_matrix.rename(index=rename_dict, columns=rename_dict)


In [41]:
# Write the distance matrix to disk (row labels are stored under the 'origin' index name).
distance_matrix.to_csv('data/processed/distance_matrix.csv')

In [39]:
# Distance sub-matrix restricted to the cities of Sao Paulo state.
data_sp = df.loc[(df['state'] == 'Sao Paulo') & (df['location_type'] == 'city')]

# distance_matrix has already been relabelled with the full ids by the time this cell runs in
# notebook order, so it is indexed with 'location_id' as-is. Dropping the last digit here
# (as before) looks up labels that no longer exist and raises KeyError.
code_list = data_sp['location_id'].to_numpy()
new_dm = distance_matrix.loc[code_list, code_list]