In [1]:
import os
import pickle
from urllib.parse import quote

import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy import inspect


In [2]:
# Pull the database connection settings out of the process environment.
# os.getenv returns None for any variable that is not set, so a missing
# variable is not reported at this point.
host, database, user, password = (
    os.getenv(name)
    for name in ('HOST', 'SQL_DATABASE', 'SQL_USER', 'SQL_PASSWORD')
)

In [3]:
# Assemble the PostgreSQL URL handed to create_engine.
# The credentials are percent-encoded first: a raw '@', ':', '/' or '#'
# inside the user name or password would otherwise be parsed as a URL
# delimiter and corrupt the connection string. quote(..., safe='') leaves
# letters, digits and '_.-~' untouched, so values without reserved
# characters produce exactly the same URL as before; str() keeps the
# previous rendering when a variable is unset (None).
# NOTE(review): assumes the environment holds the raw (not already
# percent-encoded) credentials - confirm.
safe_user = quote(str(user), safe='')
safe_password = quote(str(password), safe='')
connection_string = f"postgresql://{safe_user}:{safe_password}@{host}/{database}"

In [4]:
# Create the SQLAlchemy engine for the PostgreSQL URL built in the previous cell.
engine = create_engine(connection_string)

In [5]:
# Build an inspector on the engine and list the table names it reports
# (the bare last expression is rendered as the cell output).
insp = inspect(engine)
insp.get_table_names()

['product', 'store_cities', 'sales']

In [6]:
# Load the complete `product` table into a DataFrame, then print its
# column / non-null count / dtype summary.
product_query = 'SELECT * FROM product'
product_df = pd.read_sql(sql=product_query, con=engine)
product_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   product_id      699 non-null    object 
 1   product_length  681 non-null    float64
 2   product_depth   683 non-null    float64
 3   product_width   683 non-null    float64
 4   cluster_id      649 non-null    object 
 5   hierarchy1_id   699 non-null    object 
 6   hierarchy2_id   699 non-null    object 
 7   hierarchy3_id   699 non-null    object 
 8   hierarchy4_id   699 non-null    object 
 9   hierarchy5_id   699 non-null    object 
dtypes: float64(3), object(7)
memory usage: 54.7+ KB


In [7]:
# Load the complete `store_cities` table into a DataFrame, then print its
# column / non-null count / dtype summary.
store_cities_query = 'SELECT * FROM store_cities'
store_cities_df = pd.read_sql(sql=store_cities_query, con=engine)
store_cities_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144 entries, 0 to 143
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   store_id      144 non-null    object 
 1   storetype_id  144 non-null    object 
 2   store_size    144 non-null    float64
 3   city_id       144 non-null    object 
dtypes: float64(1), object(3)
memory usage: 4.6+ KB


In [8]:
# Load the complete `sales` table into a DataFrame in a single query, then
# print its column / dtype summary. This is by far the largest of the three
# tables (the recorded output reports ~19.5 million rows and 1.9+ GB).
sales_query = 'SELECT * FROM sales'
sales_df = pd.read_sql(sql=sales_query, con=engine)
sales_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19454838 entries, 0 to 19454837
Data columns (total 13 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   product_id             object 
 1   store_id               object 
 2   date                   object 
 3   sales                  float64
 4   revenue                float64
 5   stock                  float64
 6   price                  float64
 7   promo_type_1           object 
 8   promo_bin_1            object 
 9   promo_type_2           object 
 10  promo_bin_2            object 
 11  promo_discount_2       object 
 12  promo_discount_type_2  object 
dtypes: float64(4), object(9)
memory usage: 1.9+ GB


In [9]:
# Snapshot the sales frame to a local pickle file; a later cell reloads it
# from this same path, presumably to avoid repeating the full
# `SELECT * FROM sales` query.
# `pickle` is imported in the notebook's import cell rather than here, so the
# cell that loads the file no longer depends on this cell having run first.
filename = 'sales_df_sqlalchemy.pickle'
with open(filename, 'wb') as f:
    pickle.dump(sales_df, f)

In [10]:
# Drop the notebook's reference to the in-memory sales frame (reported above
# at 1.9+ GB); the pickled copy written in the previous cell stays on disk.
del sales_df

In [12]:
# Read the pickled sales frame back from disk into a new variable.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only point this at a file this notebook wrote itself.
filename = 'sales_df_sqlalchemy.pickle'
with open(filename, mode='rb') as pickle_file:
    loaded_sales_df = pickle.load(pickle_file)

In [13]:
# Summarise the reloaded frame; its row count, columns and dtypes can be
# compared with the sales_df.info() output printed before pickling.
loaded_sales_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19454838 entries, 0 to 19454837
Data columns (total 13 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   product_id             object 
 1   store_id               object 
 2   date                   object 
 3   sales                  float64
 4   revenue                float64
 5   stock                  float64
 6   price                  float64
 7   promo_type_1           object 
 8   promo_bin_1            object 
 9   promo_type_2           object 
 10  promo_bin_2            object 
 11  promo_discount_2       object 
 12  promo_discount_type_2  object 
dtypes: float64(4), object(9)
memory usage: 1.9+ GB
