# Import software libraries

In [None]:
# Import required libraries.
import sys           # Read system parameters.
import pandas as pd  # Manipulate and analyze data.
import sqlite3       # Manage SQL databases.

# Report the interpreter and library versions this notebook runs against.
# Note: sqlite3.sqlite_version is the version of the underlying SQLite library.
print('\n'.join([
    'Libraries used in this project:',
    f'- Python {sys.version}',
    f'- pandas {pd.__version__}',
    f'- sqlite3 {sqlite3.sqlite_version}',
]))

# Examine the database

In [None]:
# Open a connection to the SQLite database file at ./data/prod_sample.db
# (the path is relative to the current working directory) and keep the
# handle in `conn`.
conn = sqlite3.connect('./data/prod_sample.db')


In [None]:
# Ask sqlite_master for the name of every object whose type is 'table'.
query = '''
    SELECT
        name
    FROM
        sqlite_master
    WHERE
        type='table';
'''

# Run the query over the open connection and display the resulting DataFrame.
table = pd.read_sql_query(sql=query, con=conn)
table



# Read data from the `online_retail_history` table

In [None]:
# Select every column and row of the online_retail_history table.
query = '''
    SELECT
        *
    FROM
        online_retail_history;
'''

# Execute the query over the open connection and load the result into a DataFrame.
data_orh = pd.read_sql_query(sql=query, con=conn)

# Display the first five rows as a quick sanity check.
data_orh.head()


In [None]:
# Show the (rows, columns) shape of the online_retail_history DataFrame.
data_orh.shape


# Read data from the `stock_description` table

In [None]:
# Select every column and row of the stock_description table.
query = '''
    SELECT
        *
    FROM
        stock_description;
'''

# Execute the query over the open connection and load the result into a DataFrame.
data_sd = pd.read_sql_query(sql=query, con=conn)

# Display the first five rows as a quick sanity check.
data_sd.head()

In [None]:
# Show the (rows, columns) shape of the stock_description DataFrame.
data_sd.shape

# Aggregate the `online_retail_history` and `stock_description` datasets

In [None]:
# Join online_retail_history (alias o) to stock_description (alias s) on
# StockCode, keeping every history column plus the matching Description.
query = '''
    SELECT
        o.*,
        s.Description
    FROM
        online_retail_history o
        JOIN stock_description s ON o.StockCode=s.StockCode
'''

# Execute the join over the open connection and load the result into a DataFrame.
two_data = pd.read_sql_query(sql=query, con=conn)

# Display the first five rows of the joined data.
two_data.head()

In [None]:
# Show the (rows, columns) shape of the joined DataFrame.
two_data.shape


# Identify and fix corrupt or unusable data

In [None]:
# Count how many rows carry each distinct "Description" value, to spot
# placeholder or otherwise suspicious entries.
two_data['Description'].value_counts()

In [None]:
# Keep only the rows whose "Description" is not exactly a question mark (?).
two_data2 = two_data[two_data['Description'] != '?']

# Report the shape first so that the preview below remains the cell's last
# expression; a bare expression followed by another statement is evaluated
# but never displayed.
print(two_data2.shape)

# Preview the first five rows of the data.
two_data2.head()

# Identify and remove duplicates

In [None]:
# Flag every row that repeats an earlier row across all columns, then pull
# those flagged rows out for inspection.
dup_mask = two_data2.duplicated()
data_dup = two_data2.loc[dup_mask]




In [None]:
# Print the rows that were identified as duplicates.
print(data_dup)


In [None]:
# Drop rows that are exact duplicates of an earlier row.
two_data3 = two_data2.drop_duplicates()

# Report the shape first so that the preview below remains the cell's last
# expression; a bare expression followed by another statement is evaluated
# but never displayed.
print(two_data3.shape)

# Preview the first five rows of the data.
two_data3.head()

# Correct date formats

In [None]:
# Show the data type of every column in the de-duplicated DataFrame.
two_data3.dtypes # or two_data3.info()

In [None]:
# Work on an independent copy: a `[:]` slice is not a separate object, so
# assigning a column on it triggers pandas' SettingWithCopyWarning. An
# explicit .copy() makes the intent clear and leaves two_data3 untouched.
two_data4 = two_data3.copy()

# Convert "InvoiceDate" to datetime values, parsing with the "%Y-%m-%d" format.
# NOTE(review): assumes every value is a bare date; strings carrying a time
# component would fail to parse under this format. Confirm against the data.
two_data4['InvoiceDate'] = pd.to_datetime(two_data4['InvoiceDate'], format='%Y-%m-%d')

In [None]:
# Show the data type of every column after the "InvoiceDate" conversion.
two_data4.dtypes


# Examine the table before finishing

In [None]:
# Preview the first five rows of the cleaned data before saving it.

two_data4.head()

# Load the dataset into a pickle file

In [None]:
# Serialize the cleaned DataFrame to online_history_cleaned.pickle in the
# current working directory.
two_data4.to_pickle(path="online_history_cleaned.pickle")

In [None]:
# Close the SQLite connection opened earlier in the notebook; `conn` cannot
# be used for further queries after this point.
conn.close()
