In [2]:
import os
import pandas as pd
import numpy as np
import env
from sklearn.model_selection import train_test_split

def get_connection(db, user=env.user, host=env.host, password=env.password):
    '''
    Build a MySQL (pymysql driver) connection URL for the given database.

    Parameters
    ----------
    db : str
        Name of the database to connect to.
    user, host, password : str
        Credentials and server address. They default to the values defined
        in the local env.py module.

    Returns
    -------
    str
        A URL of the form 'mysql+pymysql://user:password@host/db'.

    The user name and password are percent-encoded so that reserved
    characters such as '@', ':' or '/' cannot be mistaken for URL
    delimiters. Store the raw (un-encoded) values in env.py.
    '''
    # Function-scope import keeps this cell runnable without touching the imports cell.
    from urllib.parse import quote

    # safe='' also escapes '/', which quote() leaves untouched by default.
    safe_user = quote(str(user), safe='')
    safe_password = quote(str(password), safe='')
    return f'mysql+pymysql://{safe_user}:{safe_password}@{host}/{db}'

In [11]:
def get_superstore():
    """
    Load the superstore dataset as a DataFrame.

    A local copy named "superstore.csv" in the working directory is used when it exists.
    Otherwise the data is queried from the `superstore_db` database (connection URL
    supplied by get_connection) and saved to that file so later calls can skip the query.

    Keyword arguments: none
    Returns: DataFrame
    """
    cache_path = "superstore.csv"

    # No cached copy yet: pull the joined tables from the database and store them locally.
    if not os.path.isfile(cache_path):
        query = '''
                            select * from orders
                            join customers USING (`Customer ID`)
                            join categories USING (`Category ID`)
                            join products USING (`Product ID`)
                            join regions USING (`Region ID`)
                         '''
        superstore = pd.read_sql(query, get_connection('superstore_db'))

        # index=False keeps the positional index out of the cached file.
        superstore.to_csv(cache_path, index=False)
        return superstore

    # Cached copy found: read it back instead of querying the server again.
    return pd.read_csv(cache_path)

In [12]:
# Load the superstore data via get_superstore(), which reads the local CSV cache
# when it exists and otherwise queries the database.
df = get_superstore()

In [24]:
# Normalize column labels: spaces become underscores, then everything is lower-cased
df.columns = df.columns.str.replace(' ', '_').str.lower()

In [27]:
# Parse the order dates into datetime values
df['order_date'] = pd.to_datetime(df['order_date'])

In [28]:
# Parse the ship dates into datetime values
df['ship_date'] = pd.to_datetime(df['ship_date'])

In [30]:
# Show column dtypes and non-null counts to check the frame after the conversions above
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1734 entries, 0 to 1733
Data columns (total 22 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   region_id      1734 non-null   int64         
 1   product_id     1734 non-null   object        
 2   category_id    1734 non-null   int64         
 3   customer_id    1734 non-null   object        
 4   order_id       1734 non-null   object        
 5   order_date     1734 non-null   datetime64[ns]
 6   ship_date      1734 non-null   datetime64[ns]
 7   ship_mode      1734 non-null   object        
 8   segment        1734 non-null   object        
 9   country        1734 non-null   object        
 10  city           1734 non-null   object        
 11  state          1734 non-null   object        
 12  postal_code    1734 non-null   float64       
 13  sales          1734 non-null   float64       
 14  quantity       1734 non-null   float64       
 15  discount       1734 n

In [None]:
def prepare_superstore(df):
    '''
    Normalize the column labels of ``df``: spaces are replaced with
    underscores and all letters are lower-cased.

    The new labels are assigned to ``df.columns``, so the object passed in
    is modified directly (assumes ``df`` is a pandas DataFrame).

    NOTE(review): no return statement is visible in this cell. Confirm
    whether the function is meant to return ``df`` or continues beyond
    this point.
    '''
    # lower case column names and add underscores
    df.columns = df.columns.str.replace(' ', '_')
    df.columns = df.columns.str.lower()