In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
def list_files(directory):
    """Return the names of the regular files directly inside ``directory``.

    Sub-directories (for example Jupyter's ``.ipynb_checkpoints``) are left
    out, and the names are sorted so the result does not depend on the
    arbitrary order in which ``os.listdir`` reports entries.

    Parameters
    ----------
    directory : str or path-like
        Folder to scan (not recursive).

    Returns
    -------
    list of str
        Bare file names (no directory prefix) in ascending order.

    Raises
    ------
    OSError
        Propagated from ``os.listdir`` when the folder cannot be read.
    """
    return sorted(
        entry
        for entry in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, entry))
    )

In [3]:
def get_dataframes(directory):
    """Read every sheet of every workbook in ``directory``, grouped by sheet name.

    Each parsed sheet gets an extra ``"UF"`` column whose value is derived
    from the workbook's file name: the ``".xlsx"`` suffix is removed and the
    text after the last ``"-"`` is kept, stripped of surrounding whitespace.
    Sheets named ``"Notas"`` are not part of the result.

    Parameters
    ----------
    directory : str
        Folder containing the workbooks; every name returned by
        ``list_files(directory)`` is opened with ``pd.ExcelFile``.

    Returns
    -------
    dict[str, list[pandas.DataFrame]]
        Sheet name -> list with one raw DataFrame per workbook that contains
        a sheet of that name, in the order the workbooks were read.
    """
    sheet_dict = {}

    for file_name in list_files(directory):
        # Same value for every sheet of this workbook, so compute it once.
        uf = file_name.replace(".xlsx", "").split("-")[-1].strip()

        # The context manager closes the workbook handle even if parsing a
        # sheet raises; the original left every file open.
        with pd.ExcelFile(os.path.join(directory, file_name)) as xl:
            for sheet_name in xl.sheet_names:
                # "Notas" sheets are never returned, so do not parse them.
                if sheet_name == "Notas":
                    continue

                df = xl.parse(sheet_name)
                df["UF"] = uf
                sheet_dict.setdefault(sheet_name, []).append(df)

    return sheet_dict

In [4]:
def fix_df_structure(df):
    """Promote the embedded header row to column labels and keep only data rows.

    The row at position 3 holds the column titles. Its first cell is
    overwritten with ``"Local"`` and its last cell with ``"UF"`` before the
    row becomes the column index. Rows up to and including that header row,
    as well as the final row of the sheet, are dropped, and the remaining
    rows are renumbered from 0. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw sheet with at least four rows.

    Returns
    -------
    pandas.DataFrame
        New frame with the promoted header and a fresh ``RangeIndex``.
    """
    header_pos = 3  # positional index of the row holding the column titles

    fixed = df.copy()
    fixed.iat[header_pos, 0] = "Local"
    fixed.iat[header_pos, -1] = "UF"
    fixed.columns = fixed.iloc[header_pos]

    # Everything after the header except the very last row.
    body = fixed.iloc[header_pos + 1:-1]
    return body.reset_index(drop=True)

In [5]:
def get_region_type(name):
    """Classify a place name by the number of leading spaces it carries.

    ========================  ================
    Leading spaces            Returned label
    ========================  ================
    6 or more                 ``"Município"``
    4 or 5                    ``"Microrregião"``
    2 or 3                    ``"Mesorregião"``
    0 or 1                    ``"UF"``
    ========================  ================

    Parameters
    ----------
    name : str
        Place name exactly as read from the sheet (indentation intact).

    Returns
    -------
    str
        One of the four labels above.

    Raises
    ------
    TypeError
        If ``name`` is not a string (e.g. a NaN cell). The original printed
        a message and then crashed on ``name.startswith`` with an unrelated
        AttributeError; the offending value is now part of the exception.
    """
    if not isinstance(name, str):
        raise TypeError(f"nome erro: {name!r}")

    # Deepest indentation first, because every prefix also matches the
    # shallower ones.
    if name.startswith("      "):
        return "Município"
    if name.startswith("    "):
        return "Microrregião"
    if name.startswith("  "):
        return "Mesorregião"
    return "UF"

In [6]:
def add_region(row):
    """Return a copy of ``row`` with a clean ``"Local"`` and a ``"Tipo região"`` entry.

    The region type is computed by ``get_region_type`` from the *unstripped*
    ``"Local"`` value, because the leading spaces encode the hierarchy level.
    Only afterwards is the whitespace around ``"Local"`` removed.

    The function is meant to be used with ``DataFrame.apply(..., axis=1)``.
    It works on a copy because pandas does not support functions that mutate
    (here: also enlarge) the object handed to ``apply``.

    Parameters
    ----------
    row : pandas.Series
        One table row containing a string under ``"Local"``.

    Returns
    -------
    pandas.Series
        New Series with ``"Local"`` stripped and ``"Tipo região"`` set.
    """
    result = row.copy()
    local = result["Local"]

    # Classify before stripping: the indentation is the information.
    region_type = get_region_type(local)

    result["Local"] = local.strip()
    result["Tipo região"] = region_type
    return result

In [7]:
def convert_to_number(row):
    """Return a copy of ``row`` with the five measurement columns as numbers.

    Each listed column is passed through ``float``. Values that cannot be
    converted are replaced by ``0``: strings that ``float`` rejects
    (ValueError) and, new in this version, objects such as ``None`` that
    ``float`` does not accept at all (TypeError). Previously a single
    ``None`` cell aborted the whole run.

    The function is meant to be used with ``DataFrame.apply(..., axis=1)``.
    It works on a copy because pandas does not support functions that mutate
    the object handed to ``apply``.

    Parameters
    ----------
    row : pandas.Series
        One table row containing all five measurement columns.

    Returns
    -------
    pandas.Series
        New Series; entries outside the measurement columns are unchanged.

    Raises
    ------
    KeyError
        If one of the measurement columns is missing from ``row``.
    """
    numeric_columns = (
        "Área destinada à colheita (Hectares)",
        "Área colhida (Hectares)",
        "Quantidade produzida (Toneladas)",
        "Rendimento médio da produção (Quilogramas por Hectare)",
        "Valor da produção (Mil Reais)",
    )

    result = row.copy()
    for col in numeric_columns:
        try:
            result[col] = float(row[col])
        except (TypeError, ValueError):
            # Not a number: fall back to 0, as for any unparseable cell.
            result[col] = 0

    return result

In [8]:
def update_df(df):
    """Run the full clean-up pipeline on one raw sheet.

    Steps, in order:

    1. ``fix_df_structure`` on the whole frame;
    2. ``add_region`` applied to every row;
    3. ``convert_to_number`` applied to every row.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw sheet as produced by ``get_dataframes``.

    Returns
    -------
    pandas.DataFrame
        The result of the third step.
    """
    structured = fix_df_structure(df)
    with_regions = structured.apply(add_region, axis=1)
    return with_regions.apply(convert_to_number, axis=1)

In [9]:
def join_dataframes(directory):
    """Load, clean and merge the workbooks in ``directory``, one table per sheet name.

    ``get_dataframes`` supplies a list of raw frames per sheet name. Every
    frame is passed through ``update_df`` and the cleaned frames of a sheet
    are concatenated with a fresh index. Before each sheet is processed its
    position is printed as a percentage of the total number of sheets.

    Parameters
    ----------
    directory : str
        Folder handed to ``get_dataframes``.

    Returns
    -------
    dict[str, pandas.DataFrame]
        Sheet name -> concatenated, cleaned DataFrame.
    """
    frames_by_sheet = get_dataframes(directory)
    total = len(frames_by_sheet)

    # Iterate over a snapshot of the keys; the values are replaced in the loop.
    for position, sheet_name in enumerate(list(frames_by_sheet), start=1):
        print(100*position/total, "%")
        cleaned = [update_df(raw) for raw in frames_by_sheet[sheet_name]]
        frames_by_sheet[sheet_name] = pd.concat(cleaned, ignore_index=True)

    return frames_by_sheet

In [10]:
def save_data(new_directory, sheet_dict):
    """Write every DataFrame in ``sheet_dict`` to its own ``.xlsx`` file.

    The file name is the part of the sheet name after the last ``";"``,
    stripped of surrounding whitespace and with every ``"*"`` removed. The
    index is not written.

    The output folder is created when it does not exist yet, so the call
    also works on a fresh checkout; previously ``to_excel`` failed in that
    case.

    Parameters
    ----------
    new_directory : str
        Destination folder.
    sheet_dict : dict[str, pandas.DataFrame]
        Sheet name -> table to save.

    Returns
    -------
    None
    """
    os.makedirs(new_directory, exist_ok=True)

    for sheet_name, df in sheet_dict.items():
        file_name = sheet_name.split(";")[-1].strip().replace("*", "")
        df.to_excel(os.path.join(new_directory, f"{file_name}.xlsx"), index=False)

In [11]:
# Read every workbook in "Permanentes_bruto", clean each sheet and merge the
# per-file tables into one DataFrame per sheet name. join_dataframes prints a
# progress percentage for each sheet it processes.
sheet_dict = join_dataframes("Permanentes_bruto")

2.5641025641025643 %
5.128205128205129 %
7.6923076923076925 %
10.256410256410257 %
12.820512820512821 %
15.384615384615385 %
17.94871794871795 %
20.512820512820515 %
23.076923076923077 %
25.641025641025642 %
28.205128205128204 %
30.76923076923077 %
33.333333333333336 %
35.8974358974359 %
38.46153846153846 %
41.02564102564103 %
43.58974358974359 %
46.15384615384615 %
48.717948717948715 %
51.282051282051285 %
53.84615384615385 %
56.41025641025641 %
58.97435897435897 %
61.53846153846154 %
64.1025641025641 %
66.66666666666667 %
69.23076923076923 %
71.7948717948718 %
74.35897435897436 %
76.92307692307692 %
79.48717948717949 %
82.05128205128206 %
84.61538461538461 %
87.17948717948718 %
89.74358974358974 %
92.3076923076923 %
94.87179487179488 %
97.43589743589743 %
100.0 %


In [12]:
# Write each merged table to its own .xlsx file inside "Permanentes_novo".
save_data("Permanentes_novo", sheet_dict)