###### Goals:

* connect to the database
* query the tables in the database
* Activity: basic sql queries to explore the data


In [14]:
# import the libraries needed
import os
import sys
import pandas as pd
import psycopg2

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from src.extract import connect_to_redshift

# using this library for reading password
from dotenv import load_dotenv
load_dotenv()

import warnings
warnings.filterwarnings('ignore')

## Connect to database

In [15]:

# import variables from .env file
dbname = os.getenv("dbname")
host = os.getenv("host")
port = os.getenv("port")
user = os.getenv("user")
password = os.getenv("password")
aws_access_key_id = os.getenv("aws_access_key_id")
aws_secret_access_key_id = os.getenv("aws_secret_access_key_id")

In [16]:
connect = connect_to_redshift(dbname, host, port, user, password)

connection to redshift made


## Query the Tables

In [17]:
query= """select * 
          from bootcamp.online_transactions_cleaned          
       """ 

online_trans_cleaned = pd.read_sql(query, connect)

In [18]:
online_trans_cleaned.head(5)

Unnamed: 0,invoice,stock_code,description,price,quantity,total_order_value,invoice_date,customer_id,country
0,536381,37444A,YELLOW BREAKFAST CUP AND SAUCER,2.95,1,2.95,2010-12-01 09:41:00,u15311,United Kingdom
1,536384,22189,CREAM HEART CARD HOLDER,3.95,4,15.8,2010-12-01 09:53:00,u18074,United Kingdom
2,536390,22960,JAM MAKING SET WITH JARS,3.75,12,45.0,2010-12-01 10:19:00,u17511,United Kingdom
3,536392,21891,TRADITIONAL WOODEN SKIPPING ROPE,1.25,12,15.0,2010-12-01 10:29:00,u13705,United Kingdom
4,536396,82494L,WOODEN FRAME ANTIQUE WHITE,2.55,12,30.6,2010-12-01 10:51:00,u1785,United Kingdom


In [19]:
online_trans_cleaned.shape

(399841, 9)

In [20]:
online_trans_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 399841 entries, 0 to 399840
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   invoice            399841 non-null  object        
 1   stock_code         399841 non-null  object        
 2   description        399841 non-null  object        
 3   price              399841 non-null  float64       
 4   quantity           399841 non-null  int64         
 5   total_order_value  399841 non-null  float64       
 6   invoice_date       399841 non-null  datetime64[ns]
 7   customer_id        399841 non-null  object        
 8   country            399841 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(5)
memory usage: 27.5+ MB


In [30]:
online_trans_cleaned.isnull()

Unnamed: 0,invoice,stock_code,description,price,quantity,total_order_value,invoice_date,customer_id,country
0,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...
399836,False,False,False,False,False,False,False,False,False
399837,False,False,False,False,False,False,False,False,False
399838,False,False,False,False,False,False,False,False,False
399839,False,False,False,False,False,False,False,False,False


In [31]:
online_trans_cleaned.isnull().sum()

invoice              0
stock_code           0
description          0
price                0
quantity             0
total_order_value    0
invoice_date         0
customer_id          0
country              0
dtype: int64

## SQL Query

#####  1. How many invoices does the table contain?

In [21]:
query = """select count(invoice) as number_of_invoices
           from bootcamp.online_transactions_cleaned
        """
pd.read_sql(query, connect)

Unnamed: 0,number_of_invoices
0,399841


##### 2. When was the first and last purchase?

In [22]:
query = """select MIN(invoice_date) AS first_purchase,
           MAX(invoice_date) AS last_purchase
           from bootcamp.online_transactions_cleaned 
        """
pd.read_sql(query, connect)

Unnamed: 0,first_purchase,last_purchase
0,2010-12-01 08:26:00,2011-12-09 12:50:00


##### 3.How many customers does the table contain?

In [23]:
query = """select count(Distinct customer_id) As total_customers
           from bootcamp.online_transactions_cleaned
        """
pd.read_sql(query, connect)

Unnamed: 0,total_customers
0,4363


##### 4. How many different types of stocks did the customer purchase?  

In [24]:
query = """select count(Distinct stock_code) as num_of_stock_types
           from bootcamp.online_transactions_cleaned
        """
pd.read_sql(query, connect)

Unnamed: 0,num_of_stock_types
0,3679


##### 5. What is the most popular stock? (You can look at top 10 sold items - does this differ across markets?)

In [26]:
query = """select stock_code, description, country, SUM(quantity) AS total_sold_quantity, 
           SUM(total_order_value) AS total_revenue 
           from bootcamp.online_transactions_cleaned
           group by stock_code, description, country
           order by total_sold_quantity desc
           limit 10
        """
pd.read_sql(query, connect)

Unnamed: 0,stock_code,description,country,total_sold_quantity,total_revenue
0,84077,WORLD WAR 2 GLIDERS ASSTD DESIGNS,United Kingdom,47886,11856.04
1,22197,POPCORN HOLDER,United Kingdom,45194,34090.58
2,85099B,JUMBO BAG RED RETROSPOT,United Kingdom,40777,75236.43
3,84879,ASSORTED COLOUR BIRD ORNAMENT,United Kingdom,32580,52147.56
4,85123A,CREAM HANGING HEART T-LIGHT HOLDER,United Kingdom,32136,88416.2
5,22616,PACK OF 12 LONDON TISSUES,United Kingdom,24297,6920.49
6,17003,BROCADE RING PURSE,United Kingdom,22672,5718.69
7,21212,PACK OF 72 RETROSPOT CAKE CASES,United Kingdom,22182,10856.22
8,22178,VICTORIAN GLASS HANGING T-LIGHT,United Kingdom,21427,27350.31
9,21977,PACK OF 60 PINK PAISLEY CAKE CASES,United Kingdom,19882,9438.0


#### 6. What is the average order value i.e. price * quantity?


In [27]:
query = """select avg(total_order_value) As avg_order_value
           from bootcamp.online_transactions_cleaned
        """
pd.read_sql(query, connect)

Unnamed: 0,avg_order_value
0,20.716904


#### 7. How many Stocks have the Description “Unknown”? How will you handle this when building customer segments? 

In [28]:
query = """ select count(description) As num_unknown_stocks
             from bootcamp.online_transactions_cleaned 
             where description = 'Unknown'
        """
pd.read_sql(query,connect)

Unnamed: 0,num_unknown_stocks
0,1172
