# Staging and preparing data

![ERD of the staged database](ERD_staged_database.png)

In [5]:
import psycopg2
from dotenv import load_dotenv
import os
import pandas as pd

# Pull the variables declared in the local .env file into the process
# environment, then read the Redshift login and IAM role from it.
load_dotenv()

# Each name is bound to None when its environment variable is not set.
redshift_user, redshift_password, iam_role = (
    os.environ.get(env_name)
    for env_name in ("redshift_user", "redshift_password", "iam_role")
)

# Connect to Redshift

def execute_redshift(query):
    """Execute one SQL string on Redshift, commit it, and close the connection.

    A new connection is opened for every call using the module-level
    ``redshift_user`` / ``redshift_password`` credentials.

    Args:
        query: SQL text passed unchanged to ``cursor.execute``.

    Raises:
        psycopg2.Error: if connecting or executing the statement fails. The
            transaction is rolled back and the connection is still closed.
    """
    conn = psycopg2.connect(
        host='octopus-energy-ops.202533530775.eu-west-3.redshift-serverless.amazonaws.com',
        port=5439,
        database='dev',
        user=redshift_user,
        password=redshift_password,
    )
    try:
        # `with conn` only scopes the transaction: it commits on success and
        # rolls back on error, but it does NOT close the connection.
        with conn:
            with conn.cursor() as cursor:
                cursor.execute(query)
        print('commit to Redshift')
    finally:
        # Close explicitly so a failing statement does not leak the connection.
        conn.close()

# Create a schema

In [6]:
# Create the `staging` schema that holds the typed copies of the raw tables.
# IF NOT EXISTS keeps the statement from failing when the schema is already
# there, so this cell can be re-run.
query = """
CREATE SCHEMA IF NOT EXISTS staging;
"""

execute_redshift(query)

commit to Redshift


# Staging for Call table

In [10]:
# Create the typed staging table for calls.
# IF NOT EXISTS means an already existing staging.stg_call is left untouched,
# even if its columns differ from the definition below.
query = """
CREATE TABLE IF NOT EXISTS staging.stg_call (
  call_id BIGINT,
  called_at TIMESTAMP,
  agent_id BIGINT,
  reason_id BIGINT,
  talk_time DOUBLE PRECISION,
  direction VARCHAR,
  account_id BIGINT)
"""

execute_redshift(query)

commit to Redshift


In [11]:
# Load staging.stg_call from raw_data.call, casting each column to the type
# declared on the staging table.
# - The target columns are listed explicitly so the load does not depend on
#   the physical column order of staging.stg_call.
# - reason_id: the literal string 'nan' is stored as NULL; for any other value
#   only the text before the first '.' is kept before the BIGINT cast
#   (presumably the raw values look like '12.0' -- verify against raw_data).
# NOTE(review): this is a plain INSERT, so re-running the cell appends the
# same rows a second time.
query = """
INSERT INTO
  staging.stg_call (
    call_id,
    called_at,
    agent_id,
    reason_id,
    talk_time,
    direction,
    account_id
  )
SELECT
  CAST(id as BIGINT) as call_id,
  CAST(called_at as TIMESTAMP) as called_at,
  CAST(agent_id AS BIGINT) as agent_id,
  CAST(CASE 
        WHEN reason_id = 'nan' THEN NULL
        ELSE split_part(reason_id,'.',1)
        END 
  AS BIGINT) as reason_id,
  CAST(talk_time AS DOUBLE PRECISION) as talk_time,
  direction,
  CAST(account_id AS BIGINT) as account_id
FROM
  raw_data.call
"""

execute_redshift(query)

commit to Redshift


# Staging for Call Reason table

In [13]:
# Create the staging table for call reasons (id, reason text, category).
# IF NOT EXISTS means an already existing staging.stg_call_reason is left
# untouched.
query = """
CREATE TABLE IF NOT EXISTS staging.stg_call_reason (
  call_reason_id BIGINT,
  reason TEXT,
  category TEXT)
"""

execute_redshift(query)

commit to Redshift


In [14]:
# Copy raw_data.call_reason into staging.stg_call_reason, renaming `id` to
# `call_reason_id`.
# The target columns are listed explicitly so the load does not depend on the
# physical column order of the staging table.
# NOTE(review): this is a plain INSERT, so re-running the cell appends the
# same rows a second time.
query = """
INSERT INTO
  staging.stg_call_reason (
    call_reason_id,
    reason,
    category
  )
SELECT
  id as call_reason_id,
  reason,
  category
FROM
  raw_data.call_reason
"""

execute_redshift(query)

commit to Redshift


# Staging for Account table

In [15]:
# Create the staging table for accounts (id, sales channel, sign-up date).
# IF NOT EXISTS means an already existing staging.stg_account is left
# untouched.
query = """
CREATE TABLE IF NOT EXISTS staging.stg_account (
  account_id BIGINT,
  sales_channel TEXT,
  sign_up_date DATE)
"""

execute_redshift(query)

commit to Redshift


In [16]:
# Load staging.stg_account from raw_data.account.
# - The target columns are listed explicitly so the load does not depend on
#   the physical column order of the staging table.
# - An empty-string sales_channel is replaced by the label 'No Sales Channel';
#   NULL values do not match the `= ''` test and are passed through as NULL.
# - sign_up_date is cast to DATE and aliased like the other output columns.
# NOTE(review): this is a plain INSERT, so re-running the cell appends the
# same rows a second time.
query = """
INSERT INTO
  staging.stg_account (
    account_id,
    sales_channel,
    sign_up_date
  )
SELECT
  id as account_id,
  CASE
    WHEN sales_channel = '' THEN 'No Sales Channel'
    ELSE sales_channel
  END as sales_channel,
  CAST(sign_up_date AS DATE) as sign_up_date
FROM
  raw_data.account
"""

execute_redshift(query)

commit to Redshift
