# Superstore Sales Data Modelling and Analysis in SQL

In [149]:
import pandas as pd

In [150]:
data = pd.read_csv('Superstore_data.csv')
df = pd.DataFrame(data)

In [151]:
df.head()

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,State,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales
0,1,CA-2017-152156,08/11/2017,11/11/2017,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,Kentucky,42420.0,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96
1,2,CA-2017-152156,08/11/2017,11/11/2017,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,Kentucky,42420.0,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94
2,3,CA-2017-138688,12/06/2017,16/06/2017,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,California,90036.0,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62
3,4,US-2016-108966,11/10/2016,18/10/2016,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,Florida,33311.0,South,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775
4,5,US-2016-108966,11/10/2016,18/10/2016,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,Florida,33311.0,South,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.368


In [152]:
list(df.columns)

['Row ID',
 'Order ID',
 'Order Date',
 'Ship Date',
 'Ship Mode',
 'Customer ID',
 'Customer Name',
 'Segment',
 'Country',
 'City',
 'State',
 'Postal Code',
 'Region',
 'Product ID',
 'Category',
 'Sub-Category',
 'Product Name',
 'Sales']

From this list of columns, we can see that this sales data can be normalized. <br>
This means:
1. Customer ID, Customer Name, Segment, Country, City, State, Postal Code can become a separate table named "Customers"
2. Product ID, Category, and Sub-Category can become a separate table named "Products"

Also, we can also notice that we do not really need Row ID. The column Invoice ID is good enough column to become primary key for "Sales" table.

# Data Infrastructure

As the name of the project suggests, I will be transferring the data to SQL. For that, I will have to use appropriate libraries in python to create a connection with SQL database. 

In [153]:
# importing connector
import mysql.connector
import os
from dotenv import load_dotenv

In [154]:
load_dotenv()
db_user = os.getenv('DB_USER')
db_password = os.getenv('DB_PASSWORD')

# Establishing the connection
try:
    conn = mysql.connector.connect(
        host="localhost",
        user= db_user,
        password= db_password,
    )
except mysql.connector.Error as e:
    print(f"Error: {e}")

In [155]:
# creating cursor
cursor = conn.cursor()

In [156]:
# Testing the connection
# Let us list all the Schemas/Databases in MySQL
cursor.execute("SHOW DATABASES")
dbs = cursor.fetchall()
for db in dbs:
    print(db)

('information_schema',)
('mysql',)
('parks_and_recreation',)
('performance_schema',)
('sakila',)
('superstore_db',)
('sys',)
('world',)


Our connection is set up. Now, Let us create separate Database for our Superstore Data.

In [157]:
# creating Database
cursor.execute("CREATE SCHEMA IF NOT EXISTS superstore_db")

Now our Data Infrastructure is set up. Let us move to Data modelling now.

# Data Modelling
Now we will be creating the tables we discussed above:
1. Products
2. Customers
3. Sales

For making Data modelling easier, I will be using [Quick DBD](https://www.quickdatabasediagrams.com/) platform which is efficient for data modelling in SQL.<br>The schema generated is present in the "quickDB_schema.txt" file

In [158]:
# Selecting the database
cursor.execute("USE superstore_db")

In [159]:
# Creating table "Products"
cursor.execute("DROP TABLE IF EXISTS Products")
cursor.execute("""
               CREATE TABLE `Products` (
                `product_id` VARCHAR(255)  NOT NULL ,
                `category` VARCHAR(255)  NOT NULL ,
                `sub_category` VARCHAR(255)  NOT NULL ,
                `product_name` VARCHAR(255)  NOT NULL ,
                PRIMARY KEY (
                    `product_id`
                )
            );
            """)

In [160]:
# Creating table "Customers"
cursor.execute("DROP TABLE IF EXISTS Customers")
cursor.execute("""
                CREATE TABLE `Customers` (
                    `customer_id` VARCHAR(255)  NOT NULL ,
                    `customer_name` VARCHAR(255)  NOT NULL ,
                    `segment` VARCHAR(255)  NOT NULL ,
                    `city` VARCHAR(255)  NOT NULL ,
                    `state` VARCHAR(255)  NOT NULL ,
                    `country` VARCHAR(255)  NOT NULL ,
                    `postal` INT  NOT NULL ,
                    `region` VARCHAR(255)  NOT NULL ,
                    PRIMARY KEY (
                        `customer_id`
                    )
                );
                """)

In [197]:
# Creating table "Sales"
cursor.execute("DROP TABLE IF EXISTS Sales")
cursor.execute("""
                CREATE TABLE `Sales` (
                `order_id` VARCHAR(255)  NOT NULL ,
                `order_date` DATE  NOT NULL ,
                `ship_date` DATE  NOT NULL ,
                `ship_method` VARCHAR(255)  NOT NULL ,
                `customer_id` VARCHAR(255)  NOT NULL ,
                `product_id` VARCHAR(255)  NOT NULL ,
                `sales` INT  NOT NULL ,
                PRIMARY KEY (
                    `order_id`,`product_id`,`sales`
                )
            );
            """)

In [162]:
# Altering tables to add foreign keys
cursor.execute("""
                ALTER TABLE `Sales` ADD FOREIGN KEY (`customer_id`) REFERENCES `Customers`(`customer_id`);
                """)

cursor.execute("""
                ALTER TABLE `Sales` ADD FOREIGN KEY (`product_id`) REFERENCES `Products`(`product_id`);
                """)

# Adding constraints for unique ids in products and customers
cursor.execute("""
                ALTER TABLE `Products` ADD UNIQUE (`product_id`);
                """)
cursor.execute("""
                ALTER TABLE `Customers` ADD UNIQUE (`customer_id`);
                """)

In [163]:
cursor.execute("SHOW TABLES")

In [164]:
# Displaying Tables
tables = cursor.fetchall()
for table in tables:
    print(table[0])
    cursor.execute(f"DESCRIBE {table[0]}")
    for row in cursor.fetchall():
        print(row)
    print("-----------------")


customers
('customer_id', 'varchar(255)', 'NO', 'PRI', None, '')
('customer_name', 'varchar(255)', 'NO', '', None, '')
('segment', 'varchar(255)', 'NO', '', None, '')
('city', 'varchar(255)', 'NO', '', None, '')
('state', 'varchar(255)', 'NO', '', None, '')
('country', 'varchar(255)', 'NO', '', None, '')
('postal', 'int', 'NO', '', None, '')
('region', 'varchar(255)', 'NO', '', None, '')
-----------------
products
('product_id', 'varchar(255)', 'NO', 'PRI', None, '')
('category', 'varchar(255)', 'NO', '', None, '')
('sub_category', 'varchar(255)', 'NO', '', None, '')
('product_name', 'varchar(255)', 'NO', '', None, '')
-----------------
sales
('order_id', 'varchar(255)', 'NO', 'PRI', None, '')
('order_date', 'date', 'NO', '', None, '')
('ship_date', 'date', 'NO', '', None, '')
('ship_method', 'varchar(255)', 'NO', '', None, '')
('customer_id', 'varchar(255)', 'NO', 'MUL', None, '')
('product_id', 'varchar(255)', 'NO', 'PRI', None, '')
('sales', 'int', 'NO', '', None, '')
--------------

The tables have been generated. Now, it is time to populate the tables as per the Data we have in our excel sheet. Let us remove the Null values using python because this is going to become a problem while data transfer.

# Data Cleaning with Python
We are going to remove only the null values using python. We will perform rest of the cleaning with SQL. 

In [192]:
print(len(df))
df.isnull().sum()

9800


Row ID            0
Order ID          0
Order Date        0
Ship Date         0
Ship Mode         0
Customer ID       0
Customer Name     0
Segment           0
Country           0
City              0
State             0
Postal Code      11
Region            0
Product ID        0
Category          0
Sub-Category      0
Product Name      0
Sales             0
dtype: int64

We have 9800 rows out of which only 11 rows have null values. We can remove these rows. It will not make much difference.

In [194]:
df.dropna(inplace=True)
print(len(df))

9789


Done

# Database Population
We will be parsing each row in the excel file and keep on populating our SQL database.

In [165]:
# Checking the columns in dataframe
for col in df.columns:
    print(col)

Row ID
Order ID
Order Date
Ship Date
Ship Mode
Customer ID
Customer Name
Segment
Country
City
State
Postal Code
Region
Product ID
Category
Sub-Category
Product Name
Sales


In [166]:
product_cols = ['Product ID', 'Category', 'Sub-Category', 'Product Name']
customer_cols = ['Customer ID', 'Customer Name', 'Segment', 'City', 'State', 'Country', 'Postal Code', 'Region']
sale_cols = ['Order ID', 'Order Date', 'Ship Date', 'Ship Mode', 'Customer ID', 'Product ID', 'Sales']

In [167]:
# Checking Types of columns
for col in df.columns:
    print(col, df[col].dtype)

Row ID int64
Order ID object
Order Date object
Ship Date object
Ship Mode object
Customer ID object
Customer Name object
Segment object
Country object
City object
State object
Postal Code float64
Region object
Product ID object
Category object
Sub-Category object
Product Name object
Sales float64


In [169]:
# Fixing Date columns
df['Order Date'] = pd.to_datetime(df['Order Date'], format='%d/%m/%Y')
df['Ship Date'] = pd.to_datetime(df['Ship Date'], format='%d/%m/%Y')

Let us populate the data

In [203]:
cursor.execute("DELETE FROM Products")
cursor.execute("DELETE FROM Customers")
cursor.execute("DELETE FROM Sales")


for index,row in df.iterrows():
    product_id, category, sub_category, product_name = row[product_cols]
    customer_id, customer_name, segment, city, state, country, postal, region = row[customer_cols]
    order_id, order_date, ship_date, ship_method, customer_id, product_id, sales = row[sale_cols]
    # replacing double quotes with single quotes to make parsing easi
    product_name = product_name.replace('"', "'")

    cursor.execute(f"""
                    INSERT INTO Products (product_id, category, sub_category, product_name)
                    VALUES ('{product_id}', '{category}', '{sub_category}', "{product_name}")
                    ON DUPLICATE KEY UPDATE category = '{category}', sub_category = '{sub_category}', product_name = "{product_name}";
                   """)

    cursor.execute(f"""
                    INSERT INTO Customers (customer_id, customer_name, segment, city, state, country, postal, region)
                    VALUES ('{customer_id}', "{customer_name}", '{segment}', '{city}', '{state}', '{country}', '{postal}', '{region}')
                    ON DUPLICATE KEY UPDATE customer_name = "{customer_name}", segment = '{segment}', city = '{city}', state = '{state}', country = '{country}', postal = '{postal}', region = '{region}';
                   """)
        
    cursor.execute(f"""
                    INSERT INTO Sales (order_id, order_date, ship_date, ship_method, customer_id, product_id, sales)
                    VALUES ('{order_id}', '{order_date}', '{ship_date}', '{ship_method}', '{customer_id}', '{product_id}', '{sales}')
                    ON DUPLICATE KEY UPDATE order_date = '{order_date}', ship_date = '{ship_date}', ship_method = '{ship_method}', customer_id = '{customer_id}', product_id = '{product_id}', sales = '{sales}';
                   """)