# Fetch Data

We need to fetch the data from the warehouse we created earlier and use it for analysis. This is usually the first step in any data science workflow.



In [6]:
!pip install pandasql

Collecting pandasql
  Using cached pandasql-0.7.3.tar.gz (26 kB)
Building wheels for collected packages: pandasql
  Building wheel for pandasql (setup.py): started
  Building wheel for pandasql (setup.py): finished with status 'done'
  Created wheel for pandasql: filename=pandasql-0.7.3-py3-none-any.whl size=26818 sha256=a15f731ba1e3ce76074e301598ed58afe06c966e740011be530b16c185d2f717
  Stored in directory: c:\users\blais\appdata\local\pip\cache\wheels\a6\64\11\62d5f7b88421a5d80068ac9937fe2915e497da3eba900c8f59
Successfully built pandasql
Installing collected packages: pandasql
Successfully installed pandasql-0.7.3


In [10]:
#import necessary libraries

try:
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns
    import pymysql
    import pandasql as ps
    import contextlib
    print("Successfully installed libraries ...")

except ImportError as IE:
    print("Please install the missing libraries first before proceeding {}".format(IE))
    # Re-raise so the cell fails visibly with the original traceback instead
    # of asking the interpreter/kernel to exit.
    raise

Successfully installed libraries ...


In [11]:
# Notebook-wide plotting defaults: matplotlib's ggplot style first, then the
# seaborn palette and context applied on top of it.
plt.style.use('ggplot')
sns.set_palette(palette='Reds_r')
sns.set_context(context='notebook')


In [14]:
#helpers that take a SQL string and hand back the result of the query

def mysql(query,connection: pymysql.connections.Connection):
    '''
    Run a SQL query over an open connection via pandas.
    Args:
        query: SQL text to execute
        connection: connection object the query is executed on
    Return:
        the result of pd.read_sql_query for that query
    '''
    result_frame = pd.read_sql_query(sql=query, con=connection)
    return result_frame

def sql(query):
    '''
    Run a SQL query through pandasql.
    Args:
        query: SQL text handed to ps.sqldf
    Return:
        whatever ps.sqldf returns for the query
    '''
    result = ps.sqldf(query)
    return result

## Database Connection

In [13]:
#creating a connection object
#create a context manager to handle our sql connection
@contextlib.contextmanager
def sql_connection(host='localhost',
                   user='root',
                   password='root',
                   port=3306,
                   db='dwh_whiskey',
                   charset='utf8mb4'):
    '''
    Context manager to handle sql connection
    Args:
        host: database server host
        user: user name to log in with
        password: password for that user
        port: server port
        db: database name
        charset: connection character set
    Yield:
        conn: sql connection
    Raises:
        pymysql.MySQLError: when the connection cannot be opened (re-raised
            after the error message is printed). Exceptions raised inside the
            ``with`` body propagate unchanged.
    '''
    # NOTE(review): the defaults keep the credentials that were previously
    # hard-coded here; prefer passing them in from configuration.
    try:
        conn = pymysql.connect(host=host,
                               user=user,
                               password=password,
                               port=port,
                               db=db,
                               charset=charset
                               )
    except pymysql.MySQLError as e:
        # Connecting is handled separately from the yield so that there is no
        # half-initialised connection to close and body errors are not
        # reported as connection errors.
        print("Error connecting to database: {}".format(e))
        raise
    print("Successfully connected to database")
    try:
        yield conn
    finally:
        # Runs on normal exit and when the with-body raises.
        conn.close()
        print("Connection closed")
        

## Extract Data from the Database

In [15]:
#we use the context manager to get the connection object

# The statement is built before connecting so the connection is only held
# open while the query actually runs.
# Fact rows are left-joined to customers, countries, employees and the date
# dimension, ordered by the fact date.
query = '''

SELECT
    f.date,
    d.Day_name as Day,
    d.Month_name as Month,
    d.Year_name as Year,
    f.Product_name as Product,
    f.Alcohol_Price,
    f.Alcohol_Percentage,
    f.Alcohol_Unit,
    c.full_name as customer_name,
    co.country as customer_country,
    f.credit_provider,
    e.full_name as employee_name

FROM dwh_fact as f
left join whiskey_retail_shop.customers c
on f.customer_id = c.customer_id
left join whiskey_retail_shop.countries as co
on co.country_id = c.country_id
left join dwh_employees as e
on e.employee_id = f.employee_id
left join dwh_date d
on d.Date_key = f.Date_key
order by f.date

'''

with sql_connection() as conn:
    # Generating a Dataframe according to the query; read_sql_query works on
    # the connection directly, so no explicit cursor is needed.
    print("Running query")
    df = mysql(query,conn)

Successfully connected to database
Running query
Connection closed


In [16]:
# Display the frame returned by the query above.
df

Unnamed: 0,date,Day,Month,Year,Product,Alcohol_Price,Alcohol_Percentage,Alcohol_Unit,customer_name,customer_country,credit_provider,employee_name
0,1991-01-02,Wednesday,Jan,1991,Stauning BastardMezcal Finish,65.25,46.30,93.20,Debra Peron,Aruba,VISA 13 digit,Annie Holmes
1,1991-01-03,Thursday,Jan,1991,Black Gate Batch 1 3 Year OldTBWC Australian S...,99.95,46.00,199.90,George Craft,Niue,Discover,Ann Flores
2,1991-01-03,Thursday,Jan,1991,Cameronbridge 198437 Year Old Signatory,235.00,51.60,335.70,Richard Komorowski,Turkmenistan,VISA 16 digit,Denise Snell
3,1991-01-04,Friday,Jan,1991,Kavalan King Car Conductor,81.95,46.00,117.07,Jason Donohue,French Guiana,VISA 13 digit,Margaret Williams
4,1991-01-04,Friday,Jan,1991,North British Single Grain 201110 Year Old Wat...,44.95,57.10,64.20,Juan Irwin,Cayman Islands,VISA 16 digit,Jeffery Bonson
...,...,...,...,...,...,...,...,...,...,...,...,...
11319,2021-12-27,Monday,Dec,2021,Starward 2016 Sample4 Year Old Exclusive to Th...,7.95,57.00,26.50,Marlene Hall,Uganda,JCB 16 digit,Rose Laperle
11320,2021-12-28,Tuesday,Dec,2021,Hellyers Road Original 'Roaring 40s',53.45,40.00,76.36,Tracy Macauley,Isle of Man,Diners Club / Carte Blanche,Bert Shea
11321,2021-12-29,Wednesday,Dec,2021,Orbital 8 Year OldWorld Blend Sherry Cask Whis...,39.95,46.00,57.07,Amanda Jones,Italy,Mastercard,Grace Ponce
11322,2021-12-31,Friday,Dec,2021,Springbank 1966West Highland Malt Cask #442,15000.00,61.20,20.00,Herman Butler,Antigua and Barbuda,VISA 16 digit,Nedra Beitel


In [26]:
# Save the fetched frame as CSV, leaving out the index column.
df.to_csv(path_or_buf="../data/dwh_fact.csv", index=False)

# Data Analysis

In [17]:
#check the datatype of each column (nothing is asserted in this cell)
df.dtypes

date                   object
Day                    object
Month                  object
Year                   object
Product                object
Alcohol_Price         float64
Alcohol_Percentage    float64
Alcohol_Unit          float64
customer_name          object
customer_country       object
credit_provider        object
employee_name          object
dtype: object

In [25]:
# Look at the dtype of the price column on its own.
df["Alcohol_Price"].dtype

dtype('float64')

In [19]:
def test_dates(data: pd.DataFrame,datecols: list):
    '''
    This function tests the dates in the data warehouse.
    '''
    for col in datecols:
        assert data[col].dtype == 'datetime64[ns]', "Failed date test for column: {}".format(col)

In [20]:
# NOTE: this check fails as things stand: the dtypes listing shows `date` as an object column, not datetime64.
test_dates(df,['date'])

AssertionError: Failed date test for column: date