In [1]:
import numpy as np
import os
import pandas as pd

# Acquire Some Cache

- "Flushing the cache" means to remove the old cached version and make a new one.
- FYI: The fruits table had a new record added on 2020-09-08.

## Orientation
- The first way shows a direct path to making a cached copy of query results
- The second way shows how to give the calling code some control over flushing the cache
- The 3rd way shows how to flush a cache on a condition
- The 4th way shows how to compare the modification dates of the DB table and the CSV file and automatically decide

![CacheAcquire](https://raw.githubusercontent.com/CodeupClassroom/darden-classification-exercises/master/viz/acquire_with_cache.svg)

In [2]:
# Helper Function
from env import get_url

## Simple Caching Example
- Make one single acquisition function 
- If you need to refresh the data, delete the cached copy from your filesystem

In [3]:
def get_data_from_csv(filename):
    return pd.read_csv(filename)

In [4]:
def get_data_from_sql(filename):
    """Queries the fruits_db to make a dataframe, writes the dataframe, and returns the datafram"""
    
    sql = """SELECT * FROM fruits"""
    url = get_url("fruits_db")
    
    df = pd.read_sql(sql, url)
    df.to_csv(filename)
    return df

In [5]:
def get_fruit_data1():
    """
    Returns a dataframe of fruits
    Reads from fruits.csv file if file exists
    If fruits.csv file does not exist, this function creates it, then returns the dataframe of fruits
    """
    
    filename = "fruits.csv"

    if os.path.isfile(filename):
        return get_data_from_csv(filename)
    else:
        return get_data_from_sql(filename)

In [6]:
df = get_fruit_data1()
df.head()

Unnamed: 0.1,Unnamed: 0,id,name,quantity
0,0,1,apple,3
1,1,2,banana,4
2,2,3,cantelope,16
3,3,4,dragonfruit,1
4,4,5,elderberry,2


## Give the Calling Code the Opportunity to Flush the Cache
- The use of the default argument for `flush` gives the calling code the opportunity to flush the cache

In [7]:
def get_fruit_data2(flush=False):
    """
    Returns a dataframe of fruits.
    To flush the cache and make a new file, set flush = True.
    If flush is false, this returns the cached copy.
    If flush is True, this makes the query, writes the csv results, and returns the dataframe
    If fruits.csv file does not exist, this function creates it, then returns the dataframe of fruits
    """
        
    filename = "fruits.csv"
    
    # If the filename doesn't exist or if we're flushing, then 
    if flush or os.path.isfile(filename) == False:
        return get_data_from_sql(filename)
    else:
        return get_data_from_csv(filename)


In [8]:
# How long does it take to run the query from cache
%timeit get_fruit_data2()

1.98 ms ± 347 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [9]:
# How long does it take to run the entire query
%timeit get_fruit_data2(flush=True)

1.01 s ± 56.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Flush the cache and make a new copy on a condition
- For example, if we want to flush the cache (and get a new copy) after a given number of days or a specific theshold, consider the following:

In [10]:
# This SQL outputs the number of days since the last update to the table (or a database) 
# We can get the creation time of the table, too

sql = """
select datediff(curdate(), update_time) as `days_since_update`
from information_schema.tables
where table_schema = "fruits_db"
and table_name = "fruits";
"""
url = get_url("fruits_db")

df = pd.read_sql(sql, url)
df

Unnamed: 0,days_since_update
0,2


In [11]:
def get_days_since_last_update():
    sql = """
    select datediff(curdate(), update_time) as `days_since_update`, datediff(curdate(), create_time) as `days_since_creation`
    from information_schema.tables
    where table_schema = "fruits_db"
    and table_name = "fruits";
    """
    url = get_url("fruits_db")

    df = pd.read_sql(sql, url)
    df = df.fillna(value=np.nan)

    return df.days_since_update[0]

In [12]:
def get_fruit_data3(flush=False, days_threshold=7):

    filename = "fruits.csv"

    days_since_last_update = get_days_since_last_update()

    # If The number of days since last update is longer than the threshold argument, 
    # Set flush to True and then we'll blow out the old copy and refresh the cache.
    if days_since_last_update > days_threshold:
        flush = True

    if flush or os.path.isfile(filename) == False:
        return get_data_from_sql(filename)
    else:
        return get_data_from_csv(filename)

In [13]:
df = get_fruit_data3()
df.head()

Unnamed: 0.1,Unnamed: 0,id,name,quantity
0,0,1,apple,3
1,1,2,banana,4
2,2,3,cantelope,16
3,3,4,dragonfruit,1
4,4,5,elderberry,2


## Flush the Cache Based On Modification Dates of the table and your file
- If the table was updated more recently than your file, then flush the cache and get a new copy

In [14]:
import time

def get_filename_modification_date(filename):
    # seconds since unix epoch time
    modification_time_since_epoch = os.path.getmtime(filename)
    
    # convert those seconds into a timestamp (and set the timezone to GMT)
    modification_timestamp = time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(modification_time_since_epoch))
    return modification_timestamp

get_filename_modification_date("fruits.csv")

'2020-09-10 19:10:59'

In [15]:
def get_both_modification_dates(filename):

    sql = """
        select update_time as table_updated_at
        from information_schema.tables
        where table_schema = "fruits_db"
        and table_name = "fruits";
        """
    url = get_url("fruits_db")
    df = pd.read_sql(sql, url)
    
    df["file_updated_at"] = pd.to_datetime(get_filename_modification_date(filename))

    return df

# Check out the results manually
get_both_modification_dates("fruits.csv")

Unnamed: 0,table_updated_at,file_updated_at
0,2020-09-08 20:52:04,2020-09-10 19:10:59


In [16]:
def get_fruit_data4(flush = False):
    filename = "fruits.csv"    
    
    time_df = get_both_modification_dates(filename)
    
    # If the DB table's update is newer than the file, then it's time to renew.
    flush = time_df.table_updated_at[0] > time_df.file_updated_at[0]
    
    if flush or os.path.isfile(filename) == False:
        return get_data_from_sql(filename)
    else:
        return get_data_from_csv(filename)    

In [17]:
fruits = get_fruit_data4()
fruits

Unnamed: 0.1,Unnamed: 0,id,name,quantity
0,0,1,apple,3
1,1,2,banana,4
2,2,3,cantelope,16
3,3,4,dragonfruit,1
4,4,5,elderberry,2
5,5,6,mango,57
