In [1]:

import pandas as pd
import numpy  as np
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3 as sql
import os
from sqlite3 import Error

In [2]:
# Load the Superstore export, preferring UTF-8 and retrying as Latin-1
# when the file contains bytes that are not valid UTF-8.
superstore_csv = 'Sample-Superstore.csv'
try:
    e_com = pd.read_csv(superstore_csv, encoding='utf-8')
except UnicodeDecodeError:
    e_com = pd.read_csv(superstore_csv, encoding='latin-1')


In [3]:
# Work on an independent copy so that in-place edits made to E_Commerce
# (such as reassigning its column names) do not also change the raw frame
# held in `e_com`.
E_Commerce = e_com.copy()

In [4]:
E_Commerce.shape

(9994, 21)

In [5]:
E_Commerce.columns

Index(['Row ID', 'Order ID', 'Order Date', 'Ship Date', 'Ship Mode',
       'Customer ID', 'Customer Name', 'Segment', 'Country', 'City', 'State',
       'Postal Code', 'Region', 'Product ID', 'Category', 'Sub-Category',
       'Product Name', 'Sales', 'Quantity', 'Discount', 'Profit'],
      dtype='object')

In [6]:
E_Commerce['Ship Mode']

0         Second Class
1         Second Class
2         Second Class
3       Standard Class
4       Standard Class
             ...      
9989      Second Class
9990    Standard Class
9991    Standard Class
9992    Standard Class
9993      Second Class
Name: Ship Mode, Length: 9994, dtype: object

In [7]:
# Normalise the headers: lower-case them and turn spaces into underscores
E_Commerce.columns = [
    header.lower().replace(' ', '_') for header in E_Commerce.columns
]

In [8]:
# Count the missing values in each column. A bare expression in the middle of
# a cell is never displayed, so print the counts explicitly.
missing_per_column = E_Commerce.isnull().sum()
print(missing_per_column)

# Drop every row that contains at least one missing value and report how
# many rows that removed.
rows_before_drop = len(E_Commerce)
E_Commerce = E_Commerce.dropna()
print(f"Dropped {rows_before_drop - len(E_Commerce)} rows with missing values")

In [9]:
# Parse the order and ship dates into pandas datetimes so that the `.dt`
# accessor can be used on them
E_Commerce['order_date'] = pd.to_datetime(E_Commerce['order_date'])
E_Commerce['ship_date'] = pd.to_datetime(E_Commerce['ship_date'])

# FEATURE ENGINEERING

## ADDING NEW COLUMNS 

In [10]:
# Derive calendar features (month number, year, month name) from the order date
order_date_parts = E_Commerce['order_date'].dt
E_Commerce['month'] = order_date_parts.month
E_Commerce['year'] = order_date_parts.year
E_Commerce['monthname'] = order_date_parts.month_name()





In [11]:
E_Commerce.columns

Index(['row_id', 'order_id', 'order_date', 'ship_date', 'ship_mode',
       'customer_id', 'customer_name', 'segment', 'country', 'city', 'state',
       'postal_code', 'region', 'product_id', 'category', 'sub-category',
       'product_name', 'sales', 'quantity', 'discount', 'profit', 'month',
       'year', 'monthname'],
      dtype='object')

In [12]:
# Map a month number to the name of its season
def get_season(month):
    """
    Return the season name for a month number.

    12, 1, 2 -> 'Winter'; 3, 4, 5 -> 'Spring'; 6, 7, 8 -> 'Summer';
    every other value (including 9, 10, 11) -> 'Fall'.
    """
    season_table = (
        ((12, 1, 2), 'Winter'),
        ((3, 4, 5), 'Spring'),
        ((6, 7, 8), 'Summer'),
    )
    for season_months, season_name in season_table:
        if month in season_months:
            return season_name
    # Anything not matched above falls through to 'Fall'
    return 'Fall'

# Label every row with the season of its order month
E_Commerce['season'] = E_Commerce['order_date'].dt.month.map(get_season)




In [13]:


# Add empty placeholder key columns (every value is None)
for placeholder_column in ('ship_id', 'region_id', 'profit_id'):
    E_Commerce[placeholder_column] = None


In [14]:
# Derive total_revenue as the product of the sales and quantity columns.
# NOTE(review): if `sales` is already the line total rather than a unit price,
# this multiplies by quantity twice — confirm against the dataset definition.
E_Commerce['total_revenue'] = E_Commerce['sales'].mul(E_Commerce['quantity'])

In [15]:
# Label each row 'Profit' when profit is strictly positive, otherwise 'Loss'
# (so rows with exactly zero profit are labelled 'Loss')
E_Commerce['status'] = E_Commerce['profit'].gt(0).map({True: 'Profit', False: 'Loss'})

In [16]:
# Find rows whose (order_id, product_id) pair occurs more than once;
# keep=False flags every member of a repeated pair, not only the later ones
repeated_pair_mask = E_Commerce.duplicated(subset=["order_id", "product_id"], keep=False)
duplicates = E_Commerce.loc[repeated_pair_mask]

# Show the repeated rows
print(duplicates)

      row_id        order_id order_date  ship_date       ship_mode  \
350      351  CA-2016-129714 2016-09-01 2016-09-03     First Class   
352      353  CA-2016-129714 2016-09-01 2016-09-03     First Class   
430      431  US-2016-123750 2016-04-15 2016-04-21  Standard Class   
431      432  US-2016-123750 2016-04-15 2016-04-21  Standard Class   
1300    1301  CA-2016-137043 2016-12-23 2016-12-25    Second Class   
1301    1302  CA-2016-137043 2016-12-23 2016-12-25    Second Class   
3183    3184  CA-2017-152912 2017-11-09 2017-11-12    Second Class   
3184    3185  CA-2017-152912 2017-11-09 2017-11-12    Second Class   
3405    3406  US-2014-150119 2014-04-23 2014-04-27  Standard Class   
3406    3407  US-2014-150119 2014-04-23 2014-04-27  Standard Class   
6498    6499  CA-2015-103135 2015-07-24 2015-07-28  Standard Class   
6500    6501  CA-2015-103135 2015-07-24 2015-07-28  Standard Class   
7881    7882  CA-2017-118017 2017-12-03 2017-12-06    Second Class   
7882    7883  CA-201

In [17]:
# Rename columns so they match the column names used by the SQL tables.
# Only columns that actually exist are listed: 'segment' keeps its name
# (the Products table has a `segment` column), and the DataFrame has no
# 'ship_month' column to rename.
E_Commerce = E_Commerce.rename(columns={
    'sub-category': 'sub_category',
    'sales': 'sale',
})

In [18]:
# Rename columns in the DataFrame (rename ignores keys that are not present)
E_Commerce = E_Commerce.rename(columns={
    'sub-category': 'sub_category',  # Rename 'sub-category' to 'sub_category'
    'segments': 'segment'            # Rename 'segments' back to 'segment'
})

In [19]:
E_Commerce['status']

0       Profit
1       Profit
2       Profit
3         Loss
4       Profit
         ...  
9989    Profit
9990    Profit
9991    Profit
9992    Profit
9993    Profit
Name: status, Length: 9994, dtype: object

# Login To SQL

In [26]:
import sqlite3

# Create the superstore.db schema. Every statement uses CREATE TABLE IF NOT
# EXISTS, so re-running this cell leaves existing tables untouched.
# `conn` is initialised before the try block so the `finally` clause can tell
# whether a connection was actually opened (otherwise a failing connect()
# would raise NameError there, or close a stale connection from an earlier run).
conn = None

try:
    # Create SQLite connection
    conn = sqlite3.connect("superstore.db")
    cursor = conn.cursor()

    # Create Tables
    # NOTE(review): Orders.month is declared VARCHAR(50) although the
    # DataFrame's `month` column holds month numbers — consider INT.
    cursor.executescript('''
    -- Customers Table
    CREATE TABLE IF NOT EXISTS Customers (
        customer_id VARCHAR(20) PRIMARY KEY,
        customer_name VARCHAR(100)
    );

    -- Orders Table
    CREATE TABLE IF NOT EXISTS Orders (
        order_id VARCHAR(20) PRIMARY KEY,
        order_date DATE,
        year INT,
        monthname VARCHAR(50),
        month VARCHAR(50),
        season VARCHAR(50),
        customer_id VARCHAR(20),
        FOREIGN KEY (customer_id) REFERENCES Customers(customer_id)
    );

    -- Products Table
    CREATE TABLE IF NOT EXISTS Products (
        product_id VARCHAR(20) PRIMARY KEY,
        product_name VARCHAR(255),
        category VARCHAR(50),
        sub_category VARCHAR(50),
        segment VARCHAR(50)
    );

    -- Shipment Table
    CREATE TABLE IF NOT EXISTS Shipment (
        ship_id INTEGER PRIMARY KEY AUTOINCREMENT,
        ship_date DATE,
        ship_mode VARCHAR(20),
        order_id VARCHAR(20),
        FOREIGN KEY (order_id) REFERENCES Orders(order_id)
    );

    -- Regions Table
    CREATE TABLE IF NOT EXISTS Regions (
        region_id INTEGER PRIMARY KEY AUTOINCREMENT,
        country VARCHAR(100),
        region VARCHAR(50),
        state VARCHAR(50),
        city VARCHAR(100),
        postal_code INT
    );

    -- Profits Table
    CREATE TABLE IF NOT EXISTS Profits (
        profit_id INTEGER PRIMARY KEY AUTOINCREMENT,
        profit DECIMAL(10, 2),
        sale DECIMAL(10, 2),
        total_revenue DECIMAL(10, 2),
        status VARCHAR(100),
        discount DECIMAL(5, 2),
        quantity INT,
        product_id VARCHAR(20),
        FOREIGN KEY (product_id) REFERENCES Products(product_id)
    );
    ''')

    # Commit changes
    conn.commit()

except sqlite3.Error as e:
    print(f"An error occurred: {e}")

finally:
    # Close the connection only if one was opened
    if conn is not None:
        conn.close()

In [27]:
import pandas as pd
import sqlite3
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)

def _rows_for_sqlite(frame):
    """
    Convert a DataFrame into a list of tuples that sqlite3 can bind.

    Iterating with ``itertuples`` yields plain Python scalars rather than the
    numpy scalars produced by ``to_records`` (sqlite3 has no adapter for numpy
    integer scalars, so they would not be stored as SQL integers). Missing
    values (NaN/NaT/None) are replaced by ``None`` so they are stored as NULL.
    """
    return [
        tuple(None if pd.isna(value) else value for value in row)
        for row in frame.itertuples(index=False, name=None)
    ]


def populate_database_tables(conn, E_Commerce):
    """
    Populates the database tables with data from the enriched DataFrame.

    For each table the relevant columns are selected, exact duplicate rows are
    dropped, date columns are formatted as 'YYYY-MM-DD' text, and the rows are
    written with INSERT OR IGNORE. All inserts are committed together; if
    anything fails the transaction is rolled back and the error is logged
    (it is not re-raised).

    NOTE(review): ship_id / region_id / profit_id are inserted as given. When
    they are None, Shipment, Regions and Profits have no key for
    INSERT OR IGNORE to conflict on, so calling this twice on the same data is
    expected to add those rows again — confirm whether that is acceptable.

    Args:
        conn (sqlite3.Connection): The database connection
        E_Commerce (pandas.DataFrame): Enriched DataFrame with the column names as shown

    Returns:
        sqlite3.Connection: The database connection
    """
    # (table name, columns copied from the DataFrame, date column or None).
    # Table and column names are fixed identifiers from this list, never user
    # input, so interpolating them into the SQL text below is safe.
    table_specs = [
        ('Customers', ['customer_id', 'customer_name'], None),
        ('Orders', ['order_id', 'order_date', 'year', 'monthname', 'month', 'season', 'customer_id'], 'order_date'),
        ('Products', ['product_id', 'product_name', 'category', 'sub_category', 'segment'], None),
        ('Regions', ['region_id', 'country', 'region', 'state', 'city', 'postal_code'], None),
        ('Profits', ['profit_id', 'profit', 'sale', 'total_revenue', 'status', 'discount', 'quantity', 'product_id'], None),
        ('Shipment', ['ship_id', 'ship_date', 'ship_mode', 'order_id'], 'ship_date'),
    ]

    cursor = conn.cursor()
    previous_fk_setting = None

    try:
        # Remember the caller's foreign-key setting so it can be restored later
        fk_row = cursor.execute("PRAGMA foreign_keys;").fetchone()
        if fk_row is not None:
            previous_fk_setting = int(fk_row[0])

        # Disable foreign key checks for performance
        cursor.execute("PRAGMA foreign_keys = OFF;")

        for table, columns, date_column in table_specs:
            table_df = E_Commerce[columns].drop_duplicates()
            if date_column is not None:
                # Store dates as ISO text; missing dates become NaN here and
                # are turned into NULL by _rows_for_sqlite
                formatted_dates = pd.to_datetime(table_df[date_column]).dt.strftime('%Y-%m-%d')
                table_df = table_df.assign(**{date_column: formatted_dates})

            column_list = ", ".join(columns)
            placeholders = ", ".join("?" for _ in columns)
            cursor.executemany(
                f"INSERT OR IGNORE INTO {table} ({column_list}) VALUES ({placeholders})",
                _rows_for_sqlite(table_df),
            )

        # Commit changes
        conn.commit()
        logging.info("✅ Tables populated successfully!")

    except Exception as e:
        # Discard any partially inserted rows before reporting the failure
        conn.rollback()
        logging.error(f"⚠️ An error occurred: {e}")

    finally:
        # Restore the foreign-key setting only after commit/rollback: SQLite
        # treats this pragma as a no-op while a transaction is open.
        if previous_fk_setting is not None:
            try:
                cursor.execute(f"PRAGMA foreign_keys = {previous_fk_setting};")
            except sqlite3.Error as e:
                logging.error(f"⚠️ An error occurred: {e}")

    return conn

In [28]:
# Check column names
print(E_Commerce.columns)

# Inspect the first few rows
print(E_Commerce.head())

# Check for missing values
print(E_Commerce.isnull().sum())

Index(['row_id', 'order_id', 'order_date', 'ship_date', 'ship_mode',
       'customer_id', 'customer_name', 'segment', 'country', 'city', 'state',
       'postal_code', 'region', 'product_id', 'category', 'sub_category',
       'product_name', 'sale', 'quantity', 'discount', 'profit', 'month',
       'year', 'monthname', 'season', 'ship_id', 'region_id', 'profit_id',
       'total_revenue', 'status'],
      dtype='object')
   row_id        order_id order_date  ship_date       ship_mode customer_id  \
0       1  CA-2016-152156 2016-11-08 2016-11-11    Second Class    CG-12520   
1       2  CA-2016-152156 2016-11-08 2016-11-11    Second Class    CG-12520   
2       3  CA-2016-138688 2016-06-12 2016-06-16    Second Class    DV-13045   
3       4  US-2015-108966 2015-10-11 2015-10-18  Standard Class    SO-20335   
4       5  US-2015-108966 2015-10-11 2015-10-18  Standard Class    SO-20335   

     customer_name    segment        country             city  ...    profit  \
0      Claire Gute

In [29]:
# Open the database connection
conn = sqlite3.connect("superstore.db")

try:
    # Call the function to populate tables
    populate_database_tables(conn, E_Commerce)
finally:
    # Always close the connection, even if the call above raises
    conn.close()

INFO:root:✅ Tables populated successfully!


In [31]:
# conn = sqlite3.connect("superstore.db")
# cursor = conn.cursor()

# # Query Customers Table
# print("Customers Table:")
# cursor.execute("SELECT * FROM Customers")
# print(cursor.fetchall())

# # Query Orders Table
# print("\nOrders Table:")
# cursor.execute("SELECT * FROM Orders")
# print(cursor.fetchall())

# # Query Products Table
# print("\nProducts Table:")
# cursor.execute("SELECT * FROM Products")
# print(cursor.fetchall())

# # Query Regions Table
# print("\nRegions Table:")
# cursor.execute("SELECT * FROM Regions")
# print(cursor.fetchall())

# # Query Profits Table
# print("\nProfits Table:")
# cursor.execute("SELECT * FROM Profits")
# print(cursor.fetchall())

# # Query Shipment Table
# print("\nShipment Table:")
# cursor.execute("SELECT * FROM Shipment")
# print(cursor.fetchall())

# conn.close()

In [25]:
# Save the enriched DataFrame to CSV; index=False leaves the row index out of the file
E_Commerce.to_csv('E_Commerce.csv', index=False)