STEP #0: Import Libraries

In [2]:
import pandas as pd
import numpy as np
import sqlite3
import matplotlib.pyplot as plt
import seaborn as sns

STEP #1: Initial Exploration

In [None]:
# Load data (the CSV is read as ISO-8859-1 rather than the default UTF-8)
df = pd.read_csv('../data/raw/data.csv', encoding='ISO-8859-1')

# First look - shape and first few rows
print('Shape:', df.shape)
print('\nFirst 10 rows:')
display(df.head(10))

# Column info.
# DataFrame.info() prints its report directly and returns None, so it is
# called on its own; wrapping it in display() would add a stray "None".
print('\nColumn info:')
df.info()

# Basic statistics - describe
print('\nBasic Statistics:')
display(df.describe())

# Missing values per column
print('\nMissing values per column:')
display(df.isnull().sum())

# Unique values per column.
# df.nunique() gives one count per column in a single call; display() on a
# plain f-string would show the quoted repr with a literal "\n" instead.
print('Unique values per column:')
display(df.nunique())


The dataset has 541,909 rows and 8 columns. Missing values: Description (1,454) and CustomerID (135,080); all other columns are complete.

STEP #2: Data Quality Check

In [None]:
# Check for problems

# 1. Missing CustomerIDs (count computed once, reused for the percentage)
missing_customer_ids = df['CustomerID'].isnull().sum()
print(f"Missing customer IDs: {missing_customer_ids}")
print(f"Percentage: {missing_customer_ids / len(df) * 100:.2f}%")

# 2. Negative quantities (returns/cancellations) and negative unit prices
print(f"Negative quantities: {(df['Quantity'] < 0).sum()}")
print(f"Negative prices: {(df['UnitPrice'] < 0).sum()}")

# 3. Cancelled orders (InvoiceNo starting with 'C').
# The column is cast to str before the prefix test, so the check is safe for
# any dtype; no dtype guard is used because a guard on 'object' would silently
# skip this report when the column is loaded with a different string dtype.
cancelled = df['InvoiceNo'].astype(str).str.startswith('C').sum()
print(f"Cancelled invoices: {cancelled}")

# 4. Countries
print(f"\nCountries represented: {df['Country'].nunique()}")
# Shown with display() rather than interpolated into an f-string, which would
# embed the Series repr (misaligned first row plus dtype footer) in the text.
print("\nTop Countries by transaction:")
display(df['Country'].value_counts().head(11))

# 5. Date range.
# NOTE: this converts InvoiceDate in place on df; re-running the cell is
# harmless because to_datetime leaves an already-converted column unchanged.
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
print(f"Date range: {df['InvoiceDate'].min()} to {df['InvoiceDate'].max()}")

# 6. Check for duplicates (fully identical rows)
print(f"Duplicate rows: {df.duplicated().sum()}")


STEP #3: Create SQLite Database

In [56]:
# Create database connection (kept open: the query cells below reuse `conn`)
conn = sqlite3.connect('../data/ecommerce.db')

# Load raw data into SQL, replacing any table left over from a previous run.
# InvoiceDate is converted to datetime on the frame being written so the
# date-based SQL (strftime) below does not depend on an earlier cell having
# already converted the column; this is a no-op if it is already datetime.
# .assign() returns a new frame, so df itself is not modified here.
df.assign(InvoiceDate=pd.to_datetime(df['InvoiceDate'])).to_sql(
    'transaction_raw', conn, if_exists='replace', index=False
)

# Test it works: the table's row count should match the DataFrame's
test_query = " SELECT COUNT(*) as total_rows FROM transaction_raw"
result = pd.read_sql(test_query, conn)
print(f"Rows in database: {result['total_rows'][0]}")

Rows in database: 541909


STEP #4: Exploratory SQL queries

In [64]:
# All five exploratory queries are defined first, then executed in order by a
# single loop at the bottom of the cell. Queries 1-4 keep only rows where both
# Quantity and UnitPrice are positive; Query 5 has no such filter.

# Query 1: Total transactions and revenue
query1 = """
    SELECT
        COUNT(DISTINCT InvoiceNo) as total_invoices,
        COUNT(*) as total_line_items,
        SUM(Quantity * UnitPrice) as total_revenue
    FROM transaction_raw
    WHERE Quantity > 0 AND UnitPrice > 0
"""

# Query 2: Revenue by country (ten highest by revenue)
query2 = """
SELECT
    Country,
    COUNT(DISTINCT InvoiceNo) as num_orders,
    SUM(Quantity * UnitPrice) as revenue
FROM transaction_raw
WHERE Quantity > 0 AND UnitPrice > 0
GROUP by Country
ORDER by revenue DESC
LIMIT 10
"""

# Query 3: Top products (ten Description values with the highest revenue)
query3 = """
SELECT
    Description,
    SUM(Quantity) as units_sold,
    SUM(Quantity * UnitPrice) as revenue
FROM transaction_raw
WHERE Quantity > 0 AND UnitPrice > 0
GROUP by Description
ORDER by revenue DESC
LIMIT 10
"""

# Query 4: Monthly revenue trend (InvoiceDate bucketed to 'YYYY-MM')
query4 = """
SELECT
    strftime('%Y-%m', InvoiceDate) as month,
    SUM(Quantity * UnitPrice) as revenue
FROM transaction_raw
WHERE Quantity > 0 AND UnitPrice > 0
GROUP by month
ORDER by month
"""

# Query 5: Customer count (distinct non-null CustomerID values)
query5 = """
SELECT
    COUNT(DISTINCT CustomerID) as unique_customers
FROM transaction_raw
WHERE CustomerID IS NOT NULL
"""

# Heading / query pairs, in the order they are reported
reports = [
    ("Overall statistics:", query1),
    ("\nTop 10 countries by revenue:", query2),
    ("\nTop 10 products:", query3),
    ("\nMonthly revenue:", query4),
    ("\nUnique customers:", query5),
]

# Print each heading, then run its query and show the result table
for heading, sql in reports:
    print(heading)
    display(pd.read_sql(sql, conn))


Overall statistics:


Unnamed: 0,total_invoices,total_line_items,total_revenue
0,19960,530104,10666680.0



Top 10 countries by revenue:


Unnamed: 0,Country,num_orders,revenue
0,United Kingdom,18019,9025222.084
1,Netherlands,94,285446.34
2,EIRE,288,283453.96
3,Germany,457,228867.14
4,France,392,209715.11
5,Australia,57,138521.31
6,Spain,90,61577.11
7,Switzerland,54,57089.9
8,Belgium,98,41196.34
9,Sweden,36,38378.33



Top 10 products:


Unnamed: 0,Description,units_sold,revenue
0,DOTCOM POSTAGE,706,206248.77
1,REGENCY CAKESTAND 3 TIER,13879,174484.74
2,"PAPER CRAFT , LITTLE BIRDIE",80995,168469.6
3,WHITE HANGING HEART T-LIGHT HOLDER,37891,106292.77
4,PARTY BUNTING,18295,99504.33
5,JUMBO BAG RED RETROSPOT,48474,94340.05
6,MEDIUM CERAMIC TOP STORAGE JAR,78033,81700.92
7,Manual,7225,78112.82
8,POSTAGE,3150,78101.88
9,RABBIT NIGHT LIGHT,30788,66964.99



Monthly revenue:


Unnamed: 0,month,revenue
0,2010-12,823746.14
1,2011-01,691364.56
2,2011-02,523631.89
3,2011-03,717639.36
4,2011-04,537808.621
5,2011-05,770536.02
6,2011-06,761739.9
7,2011-07,719221.191
8,2011-08,759138.38
9,2011-09,1058590.172



Unique customers:


Unnamed: 0,unique_customers
0,4372
