# Import software libraries

In [1]:
# Import required libraries.
import sys           # Read system parameters.
import pandas as pd  # Manipulate and analyze data.
import sqlite3       # Manage SQL databases.

# Report the interpreter and library versions this notebook was run with.
# A single print call with a newline separator emits one line per entry.
print(
    'Libraries used in this project:',
    '- Python {}'.format(sys.version),
    '- pandas {}'.format(pd.__version__),
    '- sqlite3 {}'.format(sqlite3.sqlite_version),
    sep='\n',
)

Libraries used in this project:
- Python 3.9.16 (main, Dec  7 2022, 01:11:51) 
[GCC 9.4.0]
- pandas 1.4.4
- sqlite3 3.31.1


# Examine the database

In [2]:
# Open a connection to the SQLite database file used by every query below.
# NOTE(review): this is an absolute path; adjust it when running in another environment.
DB_PATH = "/content/prod_sample.db"
conn = sqlite3.connect(DB_PATH)

In [3]:
# List all the tables in the database.
# SQL string literals take single quotes; double quotes denote identifiers.
# SQLite only accepts "table" as a string through a legacy fallback that can be
# disabled at compile time, so the literal is written as 'table' here.
query = """SELECT name FROM sqlite_master WHERE type = 'table';"""
table = pd.read_sql(query, conn)
table

Unnamed: 0,name
0,stock_description
1,online_retail_history


# Read data from the `online_retail_history` table

In [4]:
# Select every row and column of the online_retail_history table.
query = """SELECT * FROM online_retail_history"""

# Run the query over the open connection and load the result into a DataFrame.
data_orh = pd.read_sql(sql=query, con=conn)

# Preview the first five rows.
data_orh.head(5)

Unnamed: 0,Invoice,StockCode,Quantity,InvoiceDate,Price,CustomerID,Country,TotalAmount
0,536365,85123A,6,2010-12-01 08:26:00,2.55,u1785,United Kingdom,15.3
1,536367,84879,32,2010-12-01 08:34:00,1.69,u13047,United Kingdom,54.08
2,536373,85123A,6,2010-12-01 09:02:00,2.55,u1785,United Kingdom,15.3
3,536375,85123A,6,2010-12-01 09:32:00,2.55,u1785,United Kingdom,15.3
4,536378,20725,10,2010-12-01 09:37:00,1.65,u14688,United Kingdom,16.5


In [5]:
# Dimensions of the retail-history DataFrame as (rows, columns).
data_orh.shape

(15321, 8)

# Read data from the `stock_description` table

In [6]:
# Select every row and column of the stock_description table.
query = """SELECT * FROM stock_description;"""

# Run the query over the open connection and load the result into a DataFrame.
data_sd = pd.read_sql(sql=query, con=conn)

# Preview the first five rows.
data_sd.head(5)

Unnamed: 0,StockCode,Description
0,10002,INFLATABLE POLITICAL GLOBE
1,10080,GROOVY CACTUS INFLATABLE
2,10120,DOGGY RUBBER
3,10123C,HEARTS WRAPPING TAPE
4,10124A,SPOTS ON RED BOOKCOVER TAPE


In [7]:
# Dimensions of the stock-description DataFrame as (rows, columns).
data_sd.shape

(3952, 2)

In [8]:
# Join the retail history to the stock descriptions on StockCode so that each
# row carries the product description alongside its stock code.
# This is an inner join: history rows without a matching stock code are dropped,
# and a stock code with several description rows yields several output rows.
query = '''
    SELECT
        o.*,
        s.Description
    FROM
        online_retail_history o
        JOIN stock_description s ON o.StockCode=s.StockCode
'''

two_data = pd.read_sql(sql=query, con=conn)
two_data.head(5)

Unnamed: 0,Invoice,StockCode,Quantity,InvoiceDate,Price,CustomerID,Country,TotalAmount,Description
0,536365,85123A,6,2010-12-01 08:26:00,2.55,u1785,United Kingdom,15.3,CREAM HANGING HEART T-LIGHT HOLDER
1,536367,84879,32,2010-12-01 08:34:00,1.69,u13047,United Kingdom,54.08,ASSORTED COLOUR BIRD ORNAMENT
2,536373,85123A,6,2010-12-01 09:02:00,2.55,u1785,United Kingdom,15.3,CREAM HANGING HEART T-LIGHT HOLDER
3,536375,85123A,6,2010-12-01 09:32:00,2.55,u1785,United Kingdom,15.3,CREAM HANGING HEART T-LIGHT HOLDER
4,536378,20725,10,2010-12-01 09:37:00,1.65,u14688,United Kingdom,16.5,LUNCH BAG RED RETROSPOT


In [9]:
# Dimensions of the joined DataFrame as (rows, columns); compare the row count
# with the retail-history table to see whether the join added or removed rows.
two_data.shape

(17032, 9)

In [10]:
# Count how many joined rows carry each description value; this makes
# placeholder descriptions such as "?" easy to spot.
two_data["Description"].value_counts()

CREAM HANGING HEART T-LIGHT HOLDER    2174
JUMBO BAG RED RETROSPOT               1960
?                                     1711
REGENCY CAKESTAND 3 TIER              1711
PARTY BUNTING                         1615
LUNCH BAG RED RETROSPOT               1421
ASSORTED COLOUR BIRD ORNAMENT         1405
POPCORN HOLDER                        1329
LUNCH BAG  BLACK SKULL.               1271
SET OF 3 CAKE TINS PANTRY DESIGN      1257
PACK OF 72 RETROSPOT CAKE CASES       1178
Name: Description, dtype: int64

In [11]:
# Keep only the rows whose description is not the "?" placeholder.
two_data2 = two_data.loc[two_data["Description"].ne("?")]

In [12]:
# Preview the first five rows after removing the "?" descriptions.
two_data2.head()

Unnamed: 0,Invoice,StockCode,Quantity,InvoiceDate,Price,CustomerID,Country,TotalAmount,Description
0,536365,85123A,6,2010-12-01 08:26:00,2.55,u1785,United Kingdom,15.3,CREAM HANGING HEART T-LIGHT HOLDER
1,536367,84879,32,2010-12-01 08:34:00,1.69,u13047,United Kingdom,54.08,ASSORTED COLOUR BIRD ORNAMENT
2,536373,85123A,6,2010-12-01 09:02:00,2.55,u1785,United Kingdom,15.3,CREAM HANGING HEART T-LIGHT HOLDER
3,536375,85123A,6,2010-12-01 09:32:00,2.55,u1785,United Kingdom,15.3,CREAM HANGING HEART T-LIGHT HOLDER
4,536378,20725,10,2010-12-01 09:37:00,1.65,u14688,United Kingdom,16.5,LUNCH BAG RED RETROSPOT


In [13]:
# Dimensions after filtering, as (rows, columns).
two_data2.shape

(15321, 9)

In [14]:
# Collect the rows that exactly repeat an earlier row across all columns
# (the first occurrence of each repeated row is not included).
data_dup = two_data2.loc[two_data2.duplicated(keep="first")]

In [16]:
# Display the duplicated rows for inspection before they are dropped.
data_dup

Unnamed: 0,Invoice,StockCode,Quantity,InvoiceDate,Price,CustomerID,Country,TotalAmount,Description
178,536863,20727,1,2010-12-03 11:19:00,1.65,u17967,United Kingdom,1.65,LUNCH BAG BLACK SKULL.
497,537781,84879,8,2010-12-08 12:46:00,1.69,u17341,United Kingdom,13.52,ASSORTED COLOUR BIRD ORNAMENT
571,537955,20725,1,2010-12-09 11:28:00,1.65,u16782,United Kingdom,1.65,LUNCH BAG RED RETROSPOT
935,539092,22423,16,2010-12-16 10:08:00,10.95,u15482,United Kingdom,175.20,REGENCY CAKESTAND 3 TIER
1068,539475,22197,1,2010-12-19 14:41:00,0.85,u16686,United Kingdom,0.85,POPCORN HOLDER
...,...,...,...,...,...,...,...,...,...
16542,580048,20727,1,2011-12-01 12:53:00,1.65,u12748,United Kingdom,1.65,LUNCH BAG BLACK SKULL.
16671,580469,20727,2,2011-12-04 12:32:00,1.65,u14583,United Kingdom,3.30,LUNCH BAG BLACK SKULL.
16673,580469,20727,1,2011-12-04 12:32:00,1.65,u14583,United Kingdom,1.65,LUNCH BAG BLACK SKULL.
16716,580611,20727,1,2011-12-05 11:49:00,1.65,u12748,United Kingdom,1.65,LUNCH BAG BLACK SKULL.


In [17]:
# Remove fully duplicated rows, keeping the first occurrence of each
# (drop_duplicates defaults: all columns, keep='first'); returns a new DataFrame.
two_data3 = two_data2.drop_duplicates()

In [18]:
# Preview the first five rows of the de-duplicated data.
two_data3.head()

Unnamed: 0,Invoice,StockCode,Quantity,InvoiceDate,Price,CustomerID,Country,TotalAmount,Description
0,536365,85123A,6,2010-12-01 08:26:00,2.55,u1785,United Kingdom,15.3,CREAM HANGING HEART T-LIGHT HOLDER
1,536367,84879,32,2010-12-01 08:34:00,1.69,u13047,United Kingdom,54.08,ASSORTED COLOUR BIRD ORNAMENT
2,536373,85123A,6,2010-12-01 09:02:00,2.55,u1785,United Kingdom,15.3,CREAM HANGING HEART T-LIGHT HOLDER
3,536375,85123A,6,2010-12-01 09:32:00,2.55,u1785,United Kingdom,15.3,CREAM HANGING HEART T-LIGHT HOLDER
4,536378,20725,10,2010-12-01 09:37:00,1.65,u14688,United Kingdom,16.5,LUNCH BAG RED RETROSPOT


In [19]:
# Dimensions after de-duplication, as (rows, columns).
two_data3.shape

(15206, 9)

In [20]:
# Inspect the dtype of every column to find ones that still need conversion
# (for example, date columns stored as plain strings show up as object).
two_data3.dtypes

Invoice         object
StockCode       object
Quantity         int64
InvoiceDate     object
Price          float64
CustomerID      object
Country         object
TotalAmount    float64
Description     object
dtype: object

In [22]:
# Parse the "InvoiceDate" strings into datetime64 values.
# The stored values look like "2010-12-01 08:26:00", so the format has to include
# the time part. A date-only "%Y-%m-%d" format does not describe the whole string;
# it only worked through lenient ISO parsing and is rejected by stricter pandas versions.
# .assign() returns a new DataFrame, which avoids writing into a slice
# (two_data3[:]) and the SettingWithCopyWarning that comes with it.
two_data4 = two_data3.assign(
    InvoiceDate=pd.to_datetime(two_data3['InvoiceDate'], format='%Y-%m-%d %H:%M:%S')
)

In [23]:
# Confirm that InvoiceDate now has a datetime dtype.
two_data4.dtypes

Invoice                object
StockCode              object
Quantity                int64
InvoiceDate    datetime64[ns]
Price                 float64
CustomerID             object
Country                object
TotalAmount           float64
Description            object
dtype: object

In [24]:
# Preview the first five rows of the cleaned data.
two_data4.head()

Unnamed: 0,Invoice,StockCode,Quantity,InvoiceDate,Price,CustomerID,Country,TotalAmount,Description
0,536365,85123A,6,2010-12-01 08:26:00,2.55,u1785,United Kingdom,15.3,CREAM HANGING HEART T-LIGHT HOLDER
1,536367,84879,32,2010-12-01 08:34:00,1.69,u13047,United Kingdom,54.08,ASSORTED COLOUR BIRD ORNAMENT
2,536373,85123A,6,2010-12-01 09:02:00,2.55,u1785,United Kingdom,15.3,CREAM HANGING HEART T-LIGHT HOLDER
3,536375,85123A,6,2010-12-01 09:32:00,2.55,u1785,United Kingdom,15.3,CREAM HANGING HEART T-LIGHT HOLDER
4,536378,20725,10,2010-12-01 09:37:00,1.65,u14688,United Kingdom,16.5,LUNCH BAG RED RETROSPOT


In [25]:
# Save the cleaned DataFrame as a pickle file; the path is relative, so the file
# is written to the current working directory.
# NOTE(review): only load pickle files from trusted sources.
two_data4.to_pickle("online_history_cleaned.pickle")

In [26]:
# Close the SQLite connection that was opened for the queries above.
conn.close()