# Data visualization

In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import unicodedata
import re
import os
from sqlalchemy import create_engine, text
from dotenv import load_dotenv


# Load variables from the .env file; override=True lets values from .env
# replace variables that are already set in the process environment.
load_dotenv(override=True)

# Database settings read from environment variables
DB_USER = os.getenv("DB_USER")
DB_PASSWORD = os.getenv("DB_PASSWORD")
DB_HOST = os.getenv("DB_HOST")
DB_PORT = os.getenv("DB_PORT")
DB_NAME = os.getenv("DB_NAME")

# Build the connection URL and the engine shared by the following cells.
# NOTE(review): the credentials are interpolated verbatim; a user or password
# containing characters such as '@', ':' or '/' would need URL-encoding first.
URL = f"postgresql+psycopg2://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
engine = create_engine(URL)

# Smoke-test the connection. The connect() call lives inside the try block so
# that a failure is actually reported by the except branch, and the context
# manager closes the connection whether or not the body succeeds.
try:
    with engine.connect() as conn:
        print("Connection Succesfull!!")

except Exception as e:
    print("Error al conectar la base datos en", e)

# Connection with the PostgreSQL database
1. Create the connection
2. Show the tables
3. Verify that the tables are populated

In [26]:
# List the tables of the riwi_ventas schema and report which ones contain rows.
with engine.connect() as conn:
    # Setting schema so unqualified table names resolve to riwi_ventas
    conn.execute(text("SET search_path TO riwi_ventas"))

    # Get table list
    table_list = pd.read_sql(text("SELECT table_name FROM information_schema.tables WHERE table_schema = 'riwi_ventas';"),conn)

    # Framed header whose rule lines match the message width
    msg = f"| Tablas encontradas: {len(table_list)} |"
    print("-"*len(msg))
    print(msg)
    print("-"*len(msg))
    for idx, table in enumerate(table_list["table_name"], start=1):
        print(f"{idx}. {table}")

    print("\n-------------------------------------------\n")

    # List existing and filled tables.
    # Table names cannot be bound as query parameters, so each name is quoted
    # with the dialect's identifier preparer before being placed in the SQL.
    quote_identifier = engine.dialect.identifier_preparer.quote
    for table in table_list["table_name"]:
        # A single row is enough to tell whether the table has data
        probe = pd.read_sql(text(f"SELECT 1 FROM {quote_identifier(table)} LIMIT 1;"),conn)
        if not probe.empty:
            print(f"Table: {table} exist and is filled")
        else:
            print(f"Table: | {table} | doesn't exist or is empty please check in the Data base.")

# No explicit conn.close(): the with block closes the connection on exit.

-------------------------
| Tablas encontradas: 6 |
-------------------------
1. tipo_producto
2. producto
3. ciudad
4. factura_ventas
5. tipo_venta
6. tipo_cliente

-------------------------------------------

Table: tipo_producto exist and is filled
Table: producto exist and is filled
Table: ciudad exist and is filled
Table: factura_ventas exist and is filled
Table: tipo_venta exist and is filled
Table: tipo_cliente exist and is filled


## Load and prepare data

In [None]:
# Load the whole factura_ventas table into the df_sales DataFrame.
with engine.connect() as conn:
    # Resolve unqualified table names against the riwi_ventas schema
    conn.execute(text("SET search_path TO riwi_ventas"))

    # Query wrapped in text(), like the other queries in this notebook
    df_sales = pd.read_sql(text("SELECT * from factura_ventas;"), conn)

# The summary below only needs the DataFrame, so it runs after the connection
# has been released by the with block.
list_cols = df_sales.columns.tolist()

print(f"Data loaded: {len(df_sales):,} sales records")
print("Aviable columns")
# Column names on one line, comma-separated, without a trailing separator
print(", ".join(map(str, list_cols)))
print("\nFirst 5 rows:")
print(df_sales.head())

## Data cleaning and transformation

In [52]:
# Parse 'fecha' to datetime so the .dt accessors below are available
df_sales['fecha'] = pd.to_datetime(df_sales['fecha'])

# Derive year, month number and weekday name from the date
df_sales['ano'] = df_sales['fecha'].dt.year
df_sales['mes'] = df_sales['fecha'].dt.month
df_sales['dia_semana'] = df_sales['fecha'].dt.day_name()

# Monthly period (year-month) used for grouping
df_sales['mes_ano'] = df_sales['fecha'].dt.to_period('M')

# NOTE(review): the saved output of this cell lists a 'mes_nombre' column that
# is not created here - presumably it came from another cell or an earlier
# run; confirm with Restart Kernel -> Run All.

# Check Data
print("\nInformation about DF")
# info() writes its report to stdout and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df_sales.info()
print(f"\nDate range: {df_sales['fecha'].min()} a {df_sales['fecha'].max()}")
print(f"Current Year: {sorted(df_sales['ano'].unique())}")


Information about DF
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1047429 entries, 0 to 1047428
Data columns (total 16 columns):
 #   Column           Non-Null Count    Dtype         
---  ------           --------------    -----         
 0   venta_id         1047429 non-null  int64         
 1   fecha            1047429 non-null  datetime64[ns]
 2   ciudad_id        1047429 non-null  int64         
 3   producto_id      1047429 non-null  int64         
 4   tipo_venta_id    1047429 non-null  int64         
 5   tipo_cliente_id  1047429 non-null  int64         
 6   cantidad         1047429 non-null  float64       
 7   precio_unitario  1047429 non-null  float64       
 8   descuento        1047429 non-null  float64       
 9   costo_envio      1047429 non-null  float64       
 10  total_venta      1047429 non-null  float64       
 11  ano              1047429 non-null  int32         
 12  mes              1047429 non-null  int32         
 13  mes_nombre       1047429 non-null  