### Import Pandas


`Pandas` is a powerful data analysis and manipulation library for Python, offering easy-to-use data structures and analysis tools.

In [2]:
import os

import pandas as pd

### Define the file path

In [5]:
# Location of the exercise workbook.
# The default is the original author's local path. To run the notebook on
# another machine without editing it, set the EXERCISE_03_XLSX environment
# variable to the full path of exercise_03.xlsx before starting the kernel.
file_path = os.environ.get(
    "EXERCISE_03_XLSX",
    r"C:\Users\claud\Desktop\data_preparation_using_python\exercise_03_how_to_create_a_calendar_dimension\exercise_03.xlsx",
)

### Load the Excel File

In [6]:
# Load every sheet of the workbook into a dict of DataFrames keyed by sheet name.
#
# Both result names are initialised up front so that a failed load never leaves
# them undefined (fresh kernel) or still holding data from an earlier successful
# run of this cell; downstream cells then fail on the missing sheet key instead
# of silently working on stale data.
sheet_names = []
dataframes = {}

try:
    with pd.ExcelFile(file_path) as excel_file:

        # Get the sheet names
        sheet_names = excel_file.sheet_names
        print("Sheet Names in the Excel File:" , sheet_names)

        # Convert each sheet into a DataFrame. The dict is bound in a single
        # step, so `dataframes` is either complete or still empty - never
        # partially filled when one sheet fails to parse.
        dataframes = {sheet: excel_file.parse(sheet) for sheet in sheet_names}

except FileNotFoundError:
    print (f"Error: The file at path {file_path} was not found")
except ImportError as e:
    # Raised when the Excel engine needed to read the file is not installed
    print(f"Error {e}")
    print("Please install the missing dependency by running: pip install openpyxl or Pandas")
except Exception as e:
    # Best-effort: report any other read/parse problem without stopping the notebook
    print(f"Error reading the Excel File {e}")

Sheet Names in the Excel File: ['dim_customer', 'dim_employee', 'dim_employee_sales_territory', 'dim_geography', 'dim_reseller', 'dim_product', 'dim_sales_territory', 'fact_reseller_sales']


### Create the Calendar Dimension

In [7]:
# Peek at the first row of the fact table to check its columns before using it
dataframes["fact_reseller_sales"].iloc[:1]

Unnamed: 0,id_order,Order date,Due date,Ship date,id_product,id_reseller,id_employee,id_sales_territory,Order Quantity
0,SO43897,2017-08-25,2017-09-04,2017-09-01,235,312,282,4,2


In [9]:
# Work on an independent copy so the DataFrame stored in `dataframes` is left untouched
fact_sales = dataframes["fact_reseller_sales"].copy(deep=True)

In [16]:

# Build the calendar dimension: one row per day, spanning every full calendar
# year touched by the order dates in `fact_sales`.
# NOTE(review): only 'Order date' drives the range; 'Due date' / 'Ship date'
# values near the end of the last year could fall outside it - confirm whether
# the calendar should cover those columns as well.

# First, convert the date column to datetime if it is not already
fact_sales['Order date'] = pd.to_datetime(fact_sales['Order date'])

# Find the earliest and latest order dates in the DataFrame
fecha_min = fact_sales['Order date'].min()
fecha_max = fact_sales['Order date'].max()

# An empty or all-missing column yields NaT here; stop with a clear message
# instead of failing later when building a Timestamp from a NaN year.
if pd.isna(fecha_min) or pd.isna(fecha_max):
    raise ValueError(
        "'Order date' contains no valid dates; cannot build the calendar dimension"
    )

# Get the first and last year covered by the orders
anio_min = fecha_min.year
anio_max = fecha_max.year

# Create the calendar DataFrame directly from a continuous daily range running
# from January 1 of the first year to December 31 of the last year
calendario = pd.DataFrame({
    'fecha': pd.date_range(
        start=pd.Timestamp(anio_min, 1, 1),
        end=pd.Timestamp(anio_max, 12, 31),
    )
})

# Add the year, month and day columns
calendario['anio'] = calendario['fecha'].dt.year
calendario['mes'] = calendario['fecha'].dt.month
calendario['dia'] = calendario['fecha'].dt.day

# Print the resulting DataFrame
print(calendario)


          fecha  anio  mes  dia
0    2017-01-01  2017    1    1
1    2017-01-02  2017    1    2
2    2017-01-03  2017    1    3
3    2017-01-04  2017    1    4
4    2017-01-05  2017    1    5
...         ...   ...  ...  ...
1456 2020-12-27  2020   12   27
1457 2020-12-28  2020   12   28
1458 2020-12-29  2020   12   29
1459 2020-12-30  2020   12   30
1460 2020-12-31  2020   12   31

[1461 rows x 4 columns]
