# Explore the data

![Diagramme](ERD_raw_database.png)

In [50]:
# create a simple connector to Redshift and to transform to df

import psycopg2
from dotenv import load_dotenv
import os
import pandas as pd

# Retrieve the Redshift credentials from the .env file
load_dotenv()
redshift_user = os.getenv("redshift_user")
redshift_password = os.getenv("redshift_password")
iam_role = os.getenv("iam_role")

# Connect to Redshift

def get_dataframe(query):
  with psycopg2.connect(
    host='octopus-energy-ops.202533530775.eu-west-3.redshift-serverless.amazonaws.com',
    port=5439,
    database='dev',
    user=redshift_user,
    password=redshift_password
  ) as conn:
     with conn.cursor() as cursor:
        df = pd.read_sql_query(query, conn)
  conn.close()
  return df

## Call Reason table exploration

In [4]:
# No limit because I want to see if there is null values

query = """
SELECT 
  * 
FROM 
  dev.raw_data.call_reason 
"""

df = get_dataframe(query)
print(df.info())
df

  df = pd.read_sql_query(query, conn)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145 entries, 0 to 144
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        145 non-null    int64 
 1   reason    145 non-null    object
 2   category  145 non-null    object
dtypes: int64(1), object(2)
memory usage: 3.5+ KB
None


Unnamed: 0,id,reason,category
0,7,Where is my bill,Billing Enquiry
1,74,Refund request,Payment Enquiry
2,15,Ombudsman,Complaint
3,77,"Request for additional service (meter reader, ...",Priority Services
4,5,High Bill,Billing Enquiry
...,...,...,...
140,36,Payment confirmation,Payment Enquiry
141,119,Refund Request,Iresa Enquiry
142,137,Store sales team,M&S Enquiry (use normal call categories 1st)
143,4,General Billing Enquiry,Billing Enquiry


In [11]:
# Primary key is the call_reason_id

query = """
SELECT 
  id
  ,count(*) as nb_rows
FROM 
  dev.raw_data.call_reason
GROUP BY
  id
HAVING
  nb_rows > 1 
"""

df = get_dataframe(query)
df

  df = pd.read_sql_query(query, conn)


Unnamed: 0,id,nb_rows


In [5]:
query = """
SELECT 
  distinct(reason) 
FROM 
  dev.raw_data.call_reason 
"""

df = get_dataframe(query)
df.sort_values(by='reason')

  df = pd.read_sql_query(query, conn)


Unnamed: 0,reason
55,Account Balance
81,Add/remove and authorised person
136,Added to PSR
44,Billing
99,Billing Enquiry
...,...
18,Voicemail left
50,Vulnerable Customer
135,Warm Home Discount (WHD)
97,Where is my bill


In [7]:
query = """
SELECT 
  distinct(category) 
FROM 
  dev.raw_data.call_reason 
"""

df = get_dataframe(query)
df.sort_values(by='category')

  df = pd.read_sql_query(query, conn)


Unnamed: 0,category
3,Billing Enquiry
17,Business Supply
24,Call Transfer
11,Complaint
20,Credit Enquiry
22,Faults / Emergency
18,Industry call
16,Iresa Enquiry
23,M&S Enquiry (use normal call categories 1st)
10,Metering


## Account table exploration

In [12]:
# No limit because I want to see if there is null values

query = """
SELECT 
  * 
FROM 
  dev.raw_data.account
"""

df = get_dataframe(query)
print(df.info())
df

  df = pd.read_sql_query(query, conn)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75815 entries, 0 to 75814
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             75815 non-null  int64 
 1   sales_channel  75815 non-null  object
 2   sign_up_date   75815 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.7+ MB
None


Unnamed: 0,id,sales_channel,sign_up_date
0,461,DIRECT,2016-04-06
1,631,DIRECT,2016-04-12
2,655,DIRECT,2016-04-13
3,1425,PRICE_COMPARISON,2016-04-26
4,1844,DIRECT,2016-04-30
...,...,...,...
75810,933397,NEW_TENANT,2019-04-29
75811,934476,DIRECT,2019-04-29
75812,937868,DIRECT,2019-05-01
75813,950026,NEW_TENANT,2019-05-08


In [13]:
# Primary key is the call_reason_id

query = """
SELECT 
  id
  ,count(*) as nb_rows
FROM 
  dev.raw_data.account
GROUP BY
  id
HAVING
  nb_rows > 1 
"""

df = get_dataframe(query)
df

  df = pd.read_sql_query(query, conn)


Unnamed: 0,id,nb_rows


In [15]:
# No null but there is empty string

query = """
SELECT 
  distinct(sales_channel) 
FROM 
  dev.raw_data.account 
"""

df = get_dataframe(query)
df.sort_values(by='sales_channel')

  df = pd.read_sql_query(query, conn)


Unnamed: 0,sales_channel
5,
3,AGGREGATOR
11,BROKER
2,DIGI_TELESALES
1,DIRECT
9,FIELD_SALES
6,NEW_TENANT
7,PARENT_POWER
8,PARTNERSHIPS
10,PRICE_COMPARISON


In [27]:
# There is 5 lines with empty string
# 3 accounts are concerned with differents dates for each of them
# we won't know from which sales channel they come from, but 5 clients over 75000 is not a big deal

query = """
SELECT 
  * 
FROM 
  dev.raw_data.account
WHERE
  sales_channel = ''
"""

df = get_dataframe(query)
df

  df = pd.read_sql_query(query, conn)


Unnamed: 0,id,sales_channel,sign_up_date
0,256469,,2018-04-01
1,299671,,2018-06-11
2,121044,,2017-08-24
3,256475,,2018-04-01
4,120971,,2017-08-24


In [21]:
query = """
SELECT 
  MIN(TO_DATE(sign_up_date,'YYYY-MM-DD')) as min_date
  ,MAX(TO_DATE(sign_up_date, 'YYYY-MM-DD')) as max_date 
FROM 
  dev.raw_data.account 
"""

df = get_dataframe(query)
df

  df = pd.read_sql_query(query, conn)


Unnamed: 0,min_date,max_date
0,2015-12-21,2019-05-20


## Call table exploration

In [22]:
# No limit because I want to see if there is null values

query = """
SELECT 
  * 
FROM 
  dev.raw_data.call
"""

df = get_dataframe(query)
print(df.info())
df

  df = pd.read_sql_query(query, conn)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 7 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   id          100000 non-null  object 
 1   called_at   100000 non-null  object 
 2   agent_id    100000 non-null  object 
 3   reason_id   100000 non-null  object 
 4   talk_time   100000 non-null  float64
 5   direction   100000 non-null  object 
 6   account_id  100000 non-null  object 
dtypes: float64(1), object(6)
memory usage: 5.3+ MB
None


Unnamed: 0,id,called_at,agent_id,reason_id,talk_time,direction,account_id
0,801411,2019-02-26 12:21:38,533,2.0,274.0,Inbound,453094
1,280014,2018-04-12 14:54:26,167,40.0,1048.0,Inbound,93759
2,599888,2018-11-20 10:27:01,303,123.0,1073.0,Outbound,505219
3,352006,2018-06-26 09:36:28,252,68.0,1705.0,Inbound,257685
4,151631,2017-10-23 10:39:17,8,,33.0,Outbound,36365
...,...,...,...,...,...,...,...
99995,487597,2018-09-24 11:57:09,213,,426.0,Inbound,266779
99996,930050,2019-04-10 15:30:59,562,5.0,472.0,Inbound,311579
99997,527916,2018-10-12 08:07:20,199,79.0,158.0,Inbound,530787
99998,190894,2017-12-29 11:56:46,166,39.0,357.0,Inbound,74196


In [23]:
# Primary key is the call_id

query = """
SELECT 
  id
  ,count(*) as nb_rows
FROM 
  dev.raw_data.call
GROUP BY
  id
HAVING
  nb_rows > 1 
"""

df = get_dataframe(query)
df

  df = pd.read_sql_query(query, conn)


Unnamed: 0,id,nb_rows


In [25]:
# only two values

query = """
SELECT 
  distinct(direction)
FROM 
  dev.raw_data.call
"""

df = get_dataframe(query)
df

  df = pd.read_sql_query(query, conn)


Unnamed: 0,direction
0,Outbound
1,Inbound


In [26]:
# get an idea of the number of rows for each direction

query = """
SELECT 
  direction
  , count(*) as nb_rows
FROM 
  dev.raw_data.call
GROUP BY
  direction
"""

df = get_dataframe(query)
df

  df = pd.read_sql_query(query, conn)


Unnamed: 0,direction,nb_rows
0,Outbound,13933
1,Inbound,86067


In [28]:
# There is a difference between avg and median, we can see that the distribution is not normal
# median is lower than avg, it means that there is some outliers

query = """
SELECT 
  MIN(talk_time) as min_talk_time
  ,MAX(talk_time) as max_talk_time
  ,AVG(talk_time) as avg_talk_time
  ,MEDIAN(talk_time) as median_talk_time
  ,PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY talk_time) as q1_talk_time
  ,PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY talk_time) as q3_talk_time
FROM 
  dev.raw_data.call
"""

df = get_dataframe(query)
df

  df = pd.read_sql_query(query, conn)


Unnamed: 0,min_talk_time,max_talk_time,avg_talk_time,median_talk_time,q1_talk_time,q3_talk_time
0,1.0,14389.0,391.96757,299.0,180.0,500.0


In [39]:
# check if there is null values in the column reason_id

query = """
SELECT
  reason_id
FROM 
  dev.raw_data.call
"""

df = get_dataframe(query)
df.sort_values(by='reason_id')

  df = pd.read_sql_query(query, conn)


Unnamed: 0,reason_id
24668,1.0
97219,1.0
36298,1.0
81042,1.0
9775,1.0
...,...
62997,
62996,
62988,
63035,


In [49]:
# check how many null values in the reason_id column

query = """
SELECT
  count(*) as nb_null
FROM 
  dev.raw_data.call
WHERE
  reason_id = 'nan'
"""

df = get_dataframe(query)
df

  df = pd.read_sql_query(query, conn)


Unnamed: 0,nb_null
0,18741


In [52]:
# no problem with the account_id, all calls are related to an account

query = """
SELECT
  account_id
FROM 
  dev.raw_data.call
"""

df = get_dataframe(query)
df.sort_values(by='account_id')

  df = pd.read_sql_query(query, conn)


Unnamed: 0,account_id
3137,100001
45546,100005
98854,100007
83968,100007
20282,100007
...,...
27252,99964
31711,99976
42101,99986
11905,99993


## Cardinality

### First Assumptions

#### Call_reasons

We know that there is 145 uniques call_reasons.
We know that there is 100.000 uniques calls.
We know that some calls have no reason_id.

So we can assume that calls have zero or one call_reason, and call_reason have many calls. 


#### Account

We know that there over 75000 uniques accounts
We know that there is 100.000 uniques calls.
We know there is no null in account_id in the table calls.

So we can assume that calls have one account, and accounts can have many calls.


In [55]:


query = """
WITH casted_call AS (
SELECT
  id
  ,CAST(CASE 
        WHEN reason_id = 'nan' THEN NULL
        ELSE split_part(reason_id,'.',1)
        END 
  AS BIGINT) as reason_id
FROM
  dev.raw_data.call
)
SELECT
  call.id
  ,call.reason_id
  ,reason.id as reason_id
FROM casted_call as call
LEFT JOIN
  dev.raw_data.call_reason as reason
ON
  call.reason_id = reason.id
"""

df = get_dataframe(query)
print(df.info())
df

  df = pd.read_sql_query(query, conn)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   id         100000 non-null  object 
 1   reason_id  81259 non-null   float64
 2   id         81259 non-null   float64
dtypes: float64(2), object(1)
memory usage: 2.3+ MB
None


Unnamed: 0,id,reason_id,id.1
0,246822,68.0,68.0
1,534802,,
2,808879,34.0,34.0
3,168058,58.0,58.0
4,156603,6.0,6.0
...,...,...,...
99995,317995,79.0,79.0
99996,147783,65.0,65.0
99997,548295,,
99998,175904,30.0,30.0


In [69]:
query = """
WITH casted_call AS (
SELECT
  id
  ,CAST(CASE 
        WHEN reason_id = 'nan' THEN NULL
        ELSE split_part(reason_id,'.',1)
        END 
  AS BIGINT) as reason_id
FROM
  dev.raw_data.call
)
SELECT
  call.id
  ,call.reason_id as call_reason_fk
  ,reason.id as reason_id
  ,reason.reason
FROM casted_call as call
RIGHT JOIN
  dev.raw_data.call_reason as reason
ON
  call.reason_id = reason.id
"""

df = get_dataframe(query)
print(df.info())
df

  df = pd.read_sql_query(query, conn)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81262 entries, 0 to 81261
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              81259 non-null  object 
 1   call_reason_fk  81259 non-null  float64
 2   reason_id       81262 non-null  int64  
 3   reason          81262 non-null  object 
dtypes: float64(1), int64(1), object(2)
memory usage: 2.5+ MB
None


Unnamed: 0,id,call_reason_fk,reason_id,reason
0,759201,64.0,64,New meter reading
1,659962,34.0,34,Once off card payment
2,949775,34.0,34,Once off card payment
3,464449,34.0,34,Once off card payment
4,873401,34.0,34,Once off card payment
...,...,...,...,...
81257,93765,58.0,58,Disputed meter reading
81258,265373,58.0,58,Disputed meter reading
81259,348100,58.0,58,Disputed meter reading
81260,545880,64.0,64,New meter reading


In [71]:
df[df['call_reason_fk'].isnull()]

Unnamed: 0,id,call_reason_fk,reason_id,reason
40811,,,146,Meter operator called customer
43107,,,136,SSE customer with no account
78264,,,115,No reply - DD Amended


In [72]:
query = """
SELECT
  id
FROM
  dev.raw_data.call
WHERE
  reason_id IN ('146', '136', '115')
"""

df = get_dataframe(query)
print(df.info())
df

<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      0 non-null      object
dtypes: object(1)
memory usage: 0.0+ bytes
None


  df = pd.read_sql_query(query, conn)


Unnamed: 0,id


In [65]:
query = """
WITH casted_call AS (
SELECT
  id
  ,CAST(account_id AS BIGINT) as account_id
FROM
  dev.raw_data.call
)
SELECT
  call.id
  ,call.account_id as call_account_fk
  ,account.id as account_id
FROM casted_call as call
LEFT JOIN
  "dev"."raw_data"."account" as account
ON
  call.account_id = account.id
"""

df = get_dataframe(query)
print(df.info())
df

  df = pd.read_sql_query(query, conn)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 3 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   id               100000 non-null  object
 1   call_account_fk  100000 non-null  int64 
 2   account_id       100000 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 2.3+ MB
None


Unnamed: 0,id,call_account_fk,account_id
0,764339,308862,308862
1,877298,59957,59957
2,941076,787761,787761
3,184907,141215,141215
4,838320,241166,241166
...,...,...,...
99995,345786,162925,162925
99996,900054,872950,872950
99997,797239,597554,597554
99998,137685,31431,31431


In [67]:
query = """
WITH casted_call AS (
SELECT
  id
  ,CAST(account_id AS BIGINT) as account_id
FROM
  dev.raw_data.call
)
SELECT
  call.id
  ,call.account_id as call_account_fk
  ,account.id as account_id
FROM casted_call as call
RIGHT JOIN
  "dev"."raw_data"."account" as account
ON
  call.account_id = account.id
"""

df = get_dataframe(query)
print(df.info())
df

  df = pd.read_sql_query(query, conn)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 3 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   id               100000 non-null  object
 1   call_account_fk  100000 non-null  int64 
 2   account_id       100000 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 2.3+ MB
None


Unnamed: 0,id,call_account_fk,account_id
0,866758,825444,825444
1,491620,508517,508517
2,170506,74860,74860
3,592099,338882,338882
4,1025560,966846,966846
...,...,...,...
99995,212124,172583,172583
99996,847344,392757,392757
99997,499308,425561,425561
99998,864326,12186,12186


In [68]:
query = """
WITH casted_call AS (
SELECT
  id
  ,CAST(account_id AS BIGINT) as account_id
FROM
  dev.raw_data.call
)
SELECT
  call.id
  ,call.account_id as call_account_fk
  ,account.id as account_id
FROM casted_call as call
INNER JOIN
  "dev"."raw_data"."account" as account
ON
  call.account_id = account.id
"""

df = get_dataframe(query)
print(df.info())
df

  df = pd.read_sql_query(query, conn)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 3 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   id               100000 non-null  object
 1   call_account_fk  100000 non-null  int64 
 2   account_id       100000 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 2.3+ MB
None


Unnamed: 0,id,call_account_fk,account_id
0,929665,569857,569857
1,776017,350739,350739
2,387856,270841,270841
3,216361,170096,170096
4,648713,54061,54061
...,...,...,...
99995,504896,131327,131327
99996,384558,171526,171526
99997,679096,406179,406179
99998,1030299,807488,807488


### Final conclusions

#### Call_reasons

We know that there is 145 uniques call_reasons.
We know that there is 100.000 uniques calls.
We know that some calls have no reason_id.
With a right join and a search, we know also that there is some reasons with no calls:
- Meter operator called customer
- SSE customer with no account
- No reply - DD Amended

So we can assume that calls have zero or one call_reason, and call_reason have many calls


#### Account

We know that there over 75000 uniques accounts
We know that there is 100.000 uniques calls.
We know there is no null in account_id in the table calls.

So we can assume that calls have one account, and accounts can have many calls.